From e68ee1af261f70ee5ee5fe11d656b246ffb0835a Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 12 Oct 2021 23:19:19 -0700
Subject: [PATCH 1/7] Add Allocator abstraction

Add an Allocator abstraction that looks similar to umpire.
Use this abstraction in the cuda/hip backends.
Refactor basic_mempool into AllocatorPool as the default
cuda/hip Allocators.
Add support for users to set new Allocators for cuda/hip.
---
 include/RAJA/RAJA.hpp                      |   2 +-
 include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 171 +++++--
 include/RAJA/policy/cuda/reduce.hpp        |  28 +-
 include/RAJA/policy/cuda/scan.hpp          |  16 +-
 include/RAJA/policy/cuda/sort.hpp          |  40 +-
 include/RAJA/policy/hip/MemUtils_HIP.hpp   | 171 +++++--
 include/RAJA/policy/hip/reduce.hpp         |  28 +-
 include/RAJA/policy/hip/scan.hpp           |  16 +-
 include/RAJA/policy/hip/sort.hpp           |  40 +-
 include/RAJA/policy/sycl/MemUtils_SYCL.hpp |   1 -
 include/RAJA/util/Allocator.hpp            | 100 ++++
 include/RAJA/util/AllocatorPool.hpp        | 518 +++++++++++++++++++++
 include/RAJA/util/SoAPtr.hpp               |  40 +-
 include/RAJA/util/basic_mempool.hpp        | 427 -----------------
 14 files changed, 1001 insertions(+), 597 deletions(-)
 create mode 100644 include/RAJA/util/Allocator.hpp
 create mode 100644 include/RAJA/util/AllocatorPool.hpp
 delete mode 100644 include/RAJA/util/basic_mempool.hpp
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index 90cb5515f0..9945bd66e0 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -29,7 +29,7 @@
 #include "RAJA/config.hpp"
 
 #include "RAJA/util/Operators.hpp"
-#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/Allocator.hpp"
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 89bbea87d4..b7275d044c 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -28,10 +28,12 @@
 #include <cstdio>
 #include <type_traits>
 #include <unordered_map>
+#include <memory>
 
 #include "nvToolsExt.h"
 
-#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/Allocator.hpp"
+#include "RAJA/util/AllocatorPool.hpp"
 #include "RAJA/util/mutex.hpp"
 #include "RAJA/util/types.hpp"
 #include "RAJA/util/macros.hpp"
@@ -46,70 +48,175 @@ namespace RAJA
 namespace cuda
 {
 
+namespace detail
+{
 
-//! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+//! Allocator for device memory for use in AllocatorPool
+struct DeviceBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
-    cudaErrchk(cudaHostAlloc(&ptr, nbytes, cudaHostAllocMapped));
+    cudaErrchk(cudaMalloc(&ptr, nbytes));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
-    cudaErrchk(cudaFreeHost(ptr));
-    return true;
+    cudaErrchk(cudaFree(ptr));
   }
 };
 
-//! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+//! Allocator for pre-zeroed device memory for use in AllocatorPool
+//  Note: Memory must be zero when returned to mempool
+struct DeviceZeroedBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
     cudaErrchk(cudaMalloc(&ptr, nbytes));
+    cudaErrchk(cudaMemset(ptr, 0, nbytes));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
     cudaErrchk(cudaFree(ptr));
-    return true;
   }
 };
 
-//! Allocator for pre-zeroed device memory for use in basic_mempool
-//  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+//! Allocator for pinned memory for use in AllocatorPool
+struct PinnedBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
-    cudaErrchk(cudaMalloc(&ptr, nbytes));
-    cudaErrchk(cudaMemset(ptr, 0, nbytes));
+    cudaErrchk(cudaHostAlloc(&ptr, nbytes, cudaHostAllocMapped));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
-    cudaErrchk(cudaFree(ptr));
-    return true;
+    cudaErrchk(cudaFreeHost(ptr));
   }
 };
 
-using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
-using device_zeroed_mempool_type =
-    basic_mempool::MemPool<DeviceZeroedAllocator>;
-using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
+//! Make default allocators used by RAJA internally
+inline std::unique_ptr<RAJA::Allocator> make_default_device_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<DeviceBaseAllocator>(
+        std::string("RAJA::cuda::default_device_allocator"),
+        DeviceBaseAllocator()));
+}
+///
+inline std::unique_ptr<RAJA::Allocator> make_default_device_zeroed_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<DeviceZeroedBaseAllocator>(
+        std::string("RAJA::cuda::default_device_zeroed_allocator"),
+        DeviceZeroedBaseAllocator()));
+}
+///
+inline std::unique_ptr<RAJA::Allocator> make_default_pinned_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<PinnedBaseAllocator>(
+        std::string("RAJA::cuda::default_pinned_allocator"),
+        PinnedBaseAllocator()));
+}
+
+//! Storage for allocators used by RAJA internally
+inline std::unique_ptr<RAJA::Allocator>& get_device_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_device_allocator());
+  return allocator;
+}
+///
+inline std::unique_ptr<RAJA::Allocator>& get_device_zeroed_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_device_zeroed_allocator());
+  return allocator;
+}
+///
+inline std::unique_ptr<RAJA::Allocator>& get_pinned_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_pinned_allocator());
+  return allocator;
+}
+
+} // namespace detail
+
+//! Sets the allocator used by RAJA internally by making an allocator of
+//  allocator_type with the given arguments. It is an error to change the
+//  allocator when any memory is allocated. This routine is not thread safe.
+template < typename allocator_type, typename ... Args >
+inline void set_device_allocator(Args&&... args)
+{
+  detail::get_device_allocator().release();
+  detail::get_device_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+///
+template < typename allocator_type, typename ... Args >
+inline void set_device_zeroed_allocator(Args&&... args)
+{
+  detail::get_device_zeroed_allocator().release();
+  detail::get_device_zeroed_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+///
+template < typename allocator_type, typename ... Args >
+inline void set_pinned_allocator(Args&&... args)
+{
+  detail::get_pinned_allocator().release();
+  detail::get_pinned_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+
+//! Reset the allocator used by RAJA internally. This will destroy any existing
+//  allocator and replace it with the kind of allocator used by default. It is
+//  an error to change the allocator when any memory is allocated. This routine
+//  is not thread safe.
+inline void reset_device_allocator()
+{
+  detail::get_device_allocator().release();
+  detail::get_device_allocator() =
+      detail::make_default_device_allocator();
+}
+inline void reset_device_zeroed_allocator()
+{
+  detail::get_device_zeroed_allocator().release();
+  detail::get_device_zeroed_allocator() =
+      detail::make_default_device_zeroed_allocator();
+}
+inline void reset_pinned_allocator()
+{
+  detail::get_pinned_allocator().release();
+  detail::get_pinned_allocator() =
+      detail::make_default_pinned_allocator();
+}
+
+//! Gets the allocator used by RAJA internally. This allows the user to query
+//  the memory stats of the allocator.
+inline RAJA::Allocator& get_device_allocator()
+{
+  return *detail::get_device_allocator();
+}
+///
+inline RAJA::Allocator& get_device_zeroed_allocator()
+{
+  return *detail::get_device_zeroed_allocator();
+}
+///
+inline RAJA::Allocator& get_pinned_allocator()
+{
+  return *detail::get_pinned_allocator();
+}
+
 
 namespace detail
 {
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 9c0e358b6a..d9f6fd722e 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -32,7 +32,7 @@
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/SoAArray.hpp"
 #include "RAJA/util/SoAPtr.hpp"
-#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/Allocator.hpp"
 #include "RAJA/util/mutex.hpp"
 #include "RAJA/util/types.hpp"
 
@@ -695,7 +695,7 @@ class PinnedTally
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = cuda::pinned_mempool_type::getInstance().template malloc<Node>(1);
+    Node* n = cuda::get_pinned_allocator().template allocate<Node>(1);
     n->next = rn->node_list;
     rn->node_list = n;
     return &n->value;
@@ -718,7 +718,7 @@ class PinnedTally
       while (rn->node_list) {
         Node* n = rn->node_list;
         rn->node_list = n->next;
-        cuda::pinned_mempool_type::getInstance().free(n);
+        cuda::get_pinned_allocator().deallocate(n);
       }
       resource_list = rn->next;
       free(rn);
@@ -751,7 +751,7 @@ struct Reduce_Data {
   mutable T value;
   T identity;
   unsigned int* device_count;
-  RAJA::detail::SoAPtr<T, device_mempool_type> device;
+  RAJA::detail::SoAPtr<T> device;
   bool own_device_ptr;
 
   Reduce_Data() : Reduce_Data(T(), T()){};
@@ -813,9 +813,9 @@ struct Reduce_Data {
     if (act) {
       cuda_dim_t gridDim = currentGridDim();
       size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
-      device.allocate(numBlocks);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device.allocate(cuda::get_device_allocator(), numBlocks);
+      device_count = cuda::get_device_zeroed_allocator()
+                         .template allocate<unsigned int>(1);
       own_device_ptr = true;
     }
     return act;
@@ -827,8 +827,8 @@ struct Reduce_Data {
   {
     bool act = own_device_ptr;
     if (act) {
-      device.deallocate();
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      device.deallocate(cuda::get_device_allocator());
+      cuda::get_device_zeroed_allocator().deallocate(device_count);
       device_count = nullptr;
       own_device_ptr = false;
     }
@@ -901,9 +901,9 @@ struct ReduceAtomic_Data {
   {
     bool act = !device && setupReducers();
     if (act) {
-      device = device_mempool_type::getInstance().template malloc<T>(1);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device = cuda::get_device_allocator().template allocate<T>(1);
+      device_count = cuda::get_device_zeroed_allocator()
+                         .template allocate<unsigned int>(1);
       own_device_ptr = true;
     }
     return act;
@@ -915,9 +915,9 @@ struct ReduceAtomic_Data {
   {
     bool act = own_device_ptr;
     if (act) {
-      device_mempool_type::getInstance().free(device);
+      cuda::get_device_allocator().deallocate(device);
       device = nullptr;
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      cuda::get_device_zeroed_allocator().deallocate(device_count);
       device_count = nullptr;
       own_device_ptr = false;
     }
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index 978a1cb0a1..b49b5aeef8 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -67,7 +67,7 @@ inclusive_inplace(
                                               stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
@@ -78,7 +78,7 @@ inclusive_inplace(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   cuda::launch(cuda_res, Async);
 
@@ -120,7 +120,7 @@ exclusive_inplace(
                                               stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
@@ -132,7 +132,7 @@ exclusive_inplace(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   cuda::launch(cuda_res, Async);
 
@@ -173,7 +173,7 @@ inclusive(
                                               stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
@@ -184,7 +184,7 @@ inclusive(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   cuda::launch(cuda_res, Async);
 
@@ -228,7 +228,7 @@ exclusive(
                                               stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
@@ -240,7 +240,7 @@ exclusive(
                                               len,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   cuda::launch(cuda_res, Async);
 
diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp
index e91831bbc8..9d5546503f 100644
--- a/include/RAJA/policy/cuda/sort.hpp
+++ b/include/RAJA/policy/cuda/sort.hpp
@@ -95,7 +95,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = cuda::get_device_allocator().template allocate<R>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -113,7 +113,7 @@ stable(
                                               stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -125,7 +125,7 @@ stable(
                                               end_bit,
                                               stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   if (d_keys.Current() == d_out) {
 
@@ -133,7 +133,7 @@ stable(
     cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_out);
+  cuda::get_device_allocator().deallocate(d_out);
 
   cuda::launch(cuda_res, Async);
 
@@ -163,7 +163,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = cuda::get_device_allocator().template allocate<R>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -181,7 +181,7 @@ stable(
                                                         stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -193,7 +193,7 @@ stable(
                                                         end_bit,
                                                         stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   if (d_keys.Current() == d_out) {
 
@@ -201,7 +201,7 @@ stable(
     cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_out);
+  cuda::get_device_allocator().deallocate(d_out);
 
   cuda::launch(cuda_res, Async);
 
@@ -338,8 +338,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = cuda::get_device_allocator().template allocate<K>(len);
+  V* d_vals_out = cuda::get_device_allocator().template allocate<V>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -359,7 +359,7 @@ stable_pairs(
                                                stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -372,7 +372,7 @@ stable_pairs(
                                                end_bit,
                                                stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   if (d_keys.Current() == d_keys_out) {
 
@@ -385,8 +385,8 @@ stable_pairs(
     cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_keys_out);
-  cuda::device_mempool_type::getInstance().free(d_vals_out);
+  cuda::get_device_allocator().deallocate(d_keys_out);
+  cuda::get_device_allocator().deallocate(d_vals_out);
 
   cuda::launch(cuda_res, Async);
 
@@ -420,8 +420,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = cuda::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = cuda::get_device_allocator().template allocate<K>(len);
+  V* d_vals_out = cuda::get_device_allocator().template allocate<V>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -441,7 +441,7 @@ stable_pairs(
                                                          stream));
   // Allocate temporary storage
   d_temp_storage =
-      cuda::device_mempool_type::getInstance().malloc<unsigned char>(
+      cuda::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -454,7 +454,7 @@ stable_pairs(
                                                          end_bit,
                                                          stream));
   // Free temporary storage
-  cuda::device_mempool_type::getInstance().free(d_temp_storage);
+  cuda::get_device_allocator().deallocate(d_temp_storage);
 
   if (d_keys.Current() == d_keys_out) {
 
@@ -467,8 +467,8 @@ stable_pairs(
     cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
   }
 
-  cuda::device_mempool_type::getInstance().free(d_keys_out);
-  cuda::device_mempool_type::getInstance().free(d_vals_out);
+  cuda::get_device_allocator().deallocate(d_keys_out);
+  cuda::get_device_allocator().deallocate(d_vals_out);
 
   cuda::launch(cuda_res, Async);
 
diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index 58f36946e1..5f32b83b94 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -28,8 +28,10 @@
 #include <cstdio>
 #include <type_traits>
 #include <unordered_map>
+#include <memory>
 
-#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/Allocator.hpp"
+#include "RAJA/util/AllocatorPool.hpp"
 #include "RAJA/util/mutex.hpp"
 #include "RAJA/util/types.hpp"
 #include "RAJA/util/macros.hpp"
@@ -49,70 +51,175 @@ namespace RAJA
 namespace hip
 {
 
+namespace detail
+{
 
-//! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+//! Allocator for device memory for use in basic_mempool
+struct DeviceBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
-    hipErrchk(hipHostMalloc(&ptr, nbytes, hipHostMallocMapped));
+    hipErrchk(hipMalloc(&ptr, nbytes));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
-    hipErrchk(hipHostFree(ptr));
-    return true;
+    hipErrchk(hipFree(ptr));
   }
 };
 
-//! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+//! Allocator for pre-zeroed device memory for use in basic_mempool
+//  Note: Memory must be zero when returned to mempool
+struct DeviceZeroedBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
     hipErrchk(hipMalloc(&ptr, nbytes));
+    hipErrchk(hipMemset(ptr, 0, nbytes));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
     hipErrchk(hipFree(ptr));
-    return true;
   }
 };
 
-//! Allocator for pre-zeroed device memory for use in basic_mempool
-//  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+//! Allocator for pinned memory for use in basic_mempool
+struct PinnedBaseAllocator {
 
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes)
+  void* allocate(size_t nbytes)
   {
     void* ptr;
-    hipErrchk(hipMalloc(&ptr, nbytes));
-    hipErrchk(hipMemset(ptr, 0, nbytes));
+    hipErrchk(hipHostMalloc(&ptr, nbytes, hipHostMallocMapped));
     return ptr;
   }
 
-  // returns true on success, false on failure
-  bool free(void* ptr)
+  void deallocate(void* ptr)
   {
-    hipErrchk(hipFree(ptr));
-    return true;
+    hipErrchk(hipHostFree(ptr));
   }
 };
 
-using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
-using device_zeroed_mempool_type =
-    basic_mempool::MemPool<DeviceZeroedAllocator>;
-using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
+//! Make default allocators used by RAJA internally
+inline std::unique_ptr<RAJA::Allocator> make_default_device_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<DeviceBaseAllocator>(
+        std::string("RAJA::hip::default_device_allocator"),
+        DeviceBaseAllocator()));
+}
+///
+inline std::unique_ptr<RAJA::Allocator> make_default_device_zeroed_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<DeviceZeroedBaseAllocator>(
+        std::string("RAJA::hip::default_device_zeroed_allocator"),
+        DeviceZeroedBaseAllocator()));
+}
+///
+inline std::unique_ptr<RAJA::Allocator> make_default_pinned_allocator()
+{
+  return std::unique_ptr<RAJA::Allocator>(
+      new AllocatorPool<PinnedBaseAllocator>(
+        std::string("RAJA::hip::default_pinned_allocator"),
+        PinnedBaseAllocator()));
+}
+
+//! Storage for allocators used by RAJA internally
+inline std::unique_ptr<RAJA::Allocator>& get_device_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_device_allocator());
+  return allocator;
+}
+///
+inline std::unique_ptr<RAJA::Allocator>& get_device_zeroed_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_device_zeroed_allocator());
+  return allocator;
+}
+///
+inline std::unique_ptr<RAJA::Allocator>& get_pinned_allocator()
+{
+  static std::unique_ptr<RAJA::Allocator> allocator(
+      make_default_pinned_allocator());
+  return allocator;
+}
+
+} // namespace detail
+
+//! Sets the allocator used by RAJA internally by making an allocator of
+//  allocator_type with the given arguments. It is an error to change the
+//  allocator when any memory is allocated. This routine is not thread safe.
+template < typename allocator_type, typename ... Args >
+inline void set_device_allocator(Args&&... args)
+{
+  detail::get_device_allocator().release();
+  detail::get_device_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+///
+template < typename allocator_type, typename ... Args >
+inline void set_device_zeroed_allocator(Args&&... args)
+{
+  detail::get_device_zeroed_allocator().release();
+  detail::get_device_zeroed_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+///
+template < typename allocator_type, typename ... Args >
+inline void set_pinned_allocator(Args&&... args)
+{
+  detail::get_pinned_allocator().release();
+  detail::get_pinned_allocator().reset(
+      new allocator_type(std::forward<Args>(args)...));
+}
+
+//! Reset the allocator used by RAJA internally. This will destroy any existing
+//  allocator and replace it with the kind of allocator used by default. It is
+//  an error to change the allocator when any memory is allocated. This routine
+//  is not thread safe.
+inline void reset_device_allocator()
+{
+  detail::get_device_allocator().release();
+  detail::get_device_allocator() =
+      detail::make_default_device_allocator();
+}
+inline void reset_device_zeroed_allocator()
+{
+  detail::get_device_zeroed_allocator().release();
+  detail::get_device_zeroed_allocator() =
+      detail::make_default_device_zeroed_allocator();
+}
+inline void reset_pinned_allocator()
+{
+  detail::get_pinned_allocator().release();
+  detail::get_pinned_allocator() =
+      detail::make_default_pinned_allocator();
+}
+
+//! Gets the allocator used by RAJA internally. This allows the user to query
+//  the memory stats of the allocator.
+inline RAJA::Allocator& get_device_allocator()
+{
+  return *detail::get_device_allocator();
+}
+///
+inline RAJA::Allocator& get_device_zeroed_allocator()
+{
+  return *detail::get_device_zeroed_allocator();
+}
+///
+inline RAJA::Allocator& get_pinned_allocator()
+{
+  return *detail::get_pinned_allocator();
+}
+
 
 namespace detail
 {
diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp
index d37b0fd9ff..4ab816a5d8 100644
--- a/include/RAJA/policy/hip/reduce.hpp
+++ b/include/RAJA/policy/hip/reduce.hpp
@@ -32,7 +32,7 @@
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/SoAArray.hpp"
 #include "RAJA/util/SoAPtr.hpp"
-#include "RAJA/util/basic_mempool.hpp"
+#include "RAJA/util/Allocator.hpp"
 #include "RAJA/util/mutex.hpp"
 #include "RAJA/util/types.hpp"
 
@@ -567,7 +567,7 @@ class PinnedTally
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = hip::pinned_mempool_type::getInstance().template malloc<Node>(1);
+    Node* n = hip::get_pinned_allocator().template allocate<Node>(1);
     n->next = rn->node_list;
     rn->node_list = n;
     return &n->value;
@@ -590,7 +590,7 @@ class PinnedTally
       while (rn->node_list) {
         Node* n = rn->node_list;
         rn->node_list = n->next;
-        hip::pinned_mempool_type::getInstance().free(n);
+        hip::get_pinned_allocator().deallocate(n);
       }
       resource_list = rn->next;
       free(rn);
@@ -623,7 +623,7 @@ struct Reduce_Data {
   mutable T value;
   T identity;
   unsigned int* device_count;
-  RAJA::detail::SoAPtr<T, device_mempool_type> device;
+  RAJA::detail::SoAPtr<T> device;
   bool own_device_ptr;
 
   Reduce_Data() : Reduce_Data(T(), T()){};
@@ -683,9 +683,9 @@ struct Reduce_Data {
     if (act) {
       hip_dim_t gridDim = currentGridDim();
       size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
-      device.allocate(numBlocks);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device.allocate(hip::get_device_allocator(), numBlocks);
+      device_count = hip::get_device_zeroed_allocator()
+                         .template allocate<unsigned int>(1);
       own_device_ptr = true;
     }
     return act;
@@ -697,8 +697,8 @@ struct Reduce_Data {
   {
     bool act = own_device_ptr;
     if (act) {
-      device.deallocate();
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      device.deallocate(hip::get_device_allocator());
+      hip::get_device_zeroed_allocator().deallocate(device_count);
       device_count = nullptr;
       own_device_ptr = false;
     }
@@ -769,9 +769,9 @@ struct ReduceAtomic_Data {
   {
     bool act = !device && setupReducers();
     if (act) {
-      device = device_mempool_type::getInstance().template malloc<T>(1);
-      device_count = device_zeroed_mempool_type::getInstance()
-                         .template malloc<unsigned int>(1);
+      device = hip::get_device_allocator().template allocate<T>(1);
+      device_count = hip::get_device_zeroed_allocator()
+                         .template allocate<unsigned int>(1);
       own_device_ptr = true;
     }
     return act;
@@ -783,9 +783,9 @@ struct ReduceAtomic_Data {
   {
     bool act = own_device_ptr;
     if (act) {
-      device_mempool_type::getInstance().free(device);
+      hip::get_device_allocator().deallocate(device);
       device = nullptr;
-      device_zeroed_mempool_type::getInstance().free(device_count);
+      hip::get_device_zeroed_allocator().deallocate(device_count);
       device_count = nullptr;
       own_device_ptr = false;
     }
diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp
index 85bc494abb..18c2732175 100644
--- a/include/RAJA/policy/hip/scan.hpp
+++ b/include/RAJA/policy/hip/scan.hpp
@@ -83,7 +83,7 @@ inclusive_inplace(
 
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
@@ -104,7 +104,7 @@ inclusive_inplace(
                                              stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   hip::launch(hip_res, Async);
 
@@ -157,7 +157,7 @@ exclusive_inplace(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
@@ -180,7 +180,7 @@ exclusive_inplace(
                                              stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   hip::launch(hip_res, Async);
 
@@ -231,7 +231,7 @@ inclusive(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
@@ -252,7 +252,7 @@ inclusive(
                                              stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   hip::launch(hip_res, Async);
 
@@ -307,7 +307,7 @@ exclusive(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
@@ -330,7 +330,7 @@ exclusive(
                                              stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   hip::launch(hip_res, Async);
 
diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp
index 9090721ff5..49a2e95457 100644
--- a/include/RAJA/policy/hip/sort.hpp
+++ b/include/RAJA/policy/hip/sort.hpp
@@ -122,7 +122,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = hip::get_device_allocator().template allocate<R>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -150,7 +150,7 @@ stable(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -172,7 +172,7 @@ stable(
                                               stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   if (detail::get_current(d_keys) == d_out) {
 
@@ -180,7 +180,7 @@ stable(
     hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
   }
 
-  hip::device_mempool_type::getInstance().free(d_out);
+  hip::get_device_allocator().deallocate(d_out);
 
   hip::launch(hip_res, Async);
 
@@ -210,7 +210,7 @@ stable(
   int end_bit=sizeof(R)*CHAR_BIT;
 
   // Allocate temporary storage for the output array
-  R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
+  R* d_out = hip::get_device_allocator().template allocate<R>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the begin buffer
@@ -238,7 +238,7 @@ stable(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -260,7 +260,7 @@ stable(
                                                         stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   if (detail::get_current(d_keys) == d_out) {
 
@@ -268,7 +268,7 @@ stable(
     hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
   }
 
-  hip::device_mempool_type::getInstance().free(d_out);
+  hip::get_device_allocator().deallocate(d_out);
 
   hip::launch(hip_res, Async);
 
@@ -403,8 +403,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = hip::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = hip::get_device_allocator().template allocate<K>(len);
+  V* d_vals_out = hip::get_device_allocator().template allocate<V>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -435,7 +435,7 @@ stable_pairs(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -459,7 +459,7 @@ stable_pairs(
                                                stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   if (detail::get_current(d_keys) == d_keys_out) {
 
@@ -472,8 +472,8 @@ stable_pairs(
     hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
   }
 
-  hip::device_mempool_type::getInstance().free(d_keys_out);
-  hip::device_mempool_type::getInstance().free(d_vals_out);
+  hip::get_device_allocator().deallocate(d_keys_out);
+  hip::get_device_allocator().deallocate(d_vals_out);
 
   hip::launch(hip_res, Async);
 
@@ -507,8 +507,8 @@ stable_pairs(
   int end_bit=sizeof(K)*CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
-  K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
-  V* d_vals_out = hip::device_mempool_type::getInstance().malloc<V>(len);
+  K* d_keys_out = hip::get_device_allocator().template allocate<K>(len);
+  V* d_vals_out = hip::get_device_allocator().template allocate<V>(len);
 
   // use cub double buffer to reduce temporary memory requirements
   // by allowing cub to write to the keys_begin and vals_begin buffers
@@ -539,7 +539,7 @@ stable_pairs(
 #endif
   // Allocate temporary storage
   d_temp_storage =
-      hip::device_mempool_type::getInstance().malloc<unsigned char>(
+      hip::get_device_allocator().template allocate<unsigned char>(
           temp_storage_bytes);
 
   // Run
@@ -563,7 +563,7 @@ stable_pairs(
                                                          stream));
 #endif
   // Free temporary storage
-  hip::device_mempool_type::getInstance().free(d_temp_storage);
+  hip::get_device_allocator().deallocate(d_temp_storage);
 
   if (detail::get_current(d_keys) == d_keys_out) {
 
@@ -576,8 +576,8 @@ stable_pairs(
     hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
   }
 
-  hip::device_mempool_type::getInstance().free(d_keys_out);
-  hip::device_mempool_type::getInstance().free(d_vals_out);
+  hip::get_device_allocator().deallocate(d_keys_out);
+  hip::get_device_allocator().deallocate(d_vals_out);
 
   hip::launch(hip_res, Async);
 
diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
index bca7c20550..ab4bb10c8c 100644
--- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
+++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
@@ -31,7 +31,6 @@
 #include <type_traits>
 #include <unordered_map>
 
-#include "RAJA/util/basic_mempool.hpp"
 #include "RAJA/util/mutex.hpp"
 #include "RAJA/util/types.hpp"
 
diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp
new file mode 100644
index 0000000000..f0e226bfb7
--- /dev/null
+++ b/include/RAJA/util/Allocator.hpp
@@ -0,0 +1,100 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing an implementation of a memory pool.
+ *
+ ******************************************************************************
+ */
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJA_UTIL_ALLOCATOR_HPP
+#define RAJA_UTIL_ALLOCATOR_HPP
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+#include "RAJA/util/camp_aliases.hpp"
+
+namespace RAJA
+{
+
+/*! \class Allocator
+ ******************************************************************************
+ *
+ * \brief  Allocator Provides a generic interface for allocation in RAJA
+ *
+ * Allocator& device_pool = RAJA::cuda::get_device_allocator();
+ *
+ ******************************************************************************
+ */
+struct Allocator
+{
+  Allocator() = default;
+
+  // not copyable or movable
+  Allocator(Allocator const&) = delete;
+  Allocator(Allocator &&) = delete;
+  Allocator& operator=(Allocator const&) = delete;
+  Allocator& operator=(Allocator &&) = delete;
+
+  virtual ~Allocator() = default;
+
+  virtual void* allocate(size_t nbytes,
+                         size_t alignment = alignof(std::max_align_t)) = 0;
+
+  template <typename T>
+  inline T* allocate(size_t nitems,
+                     size_t alignment = alignof(T))
+  {
+    return static_cast<T*>(this->allocate(sizeof(T)*nitems, alignment));
+  }
+
+  virtual void deallocate(void* ptr) = 0;
+
+  virtual void release() = 0;
+
+  virtual size_t getHighWatermark() const noexcept = 0;
+
+  virtual size_t getCurrentSize() const noexcept = 0;
+
+  virtual size_t getActualSize() const noexcept = 0;
+
+  virtual size_t getAllocationCount() const noexcept = 0;
+
+  virtual const std::string& getName() const noexcept = 0;
+
+  // virtual Platform getPlatform() const noexcept = 0;
+};
+
+namespace detail
+{
+
+inline std::vector<Allocator*>& get_allocators()
+{
+  static std::vector<Allocator*> allocators;
+  return allocators;
+}
+
+} /* end namespace detail */
+
+/*!
+ * \brief Get the set of allocators used by RAJA internally
+ */
+inline std::vector<Allocator*> get_allocators()
+{
+  return detail::get_allocators();
+}
+
+} /* end namespace RAJA */
+
+
+#endif /* RAJA_UTIL_ALLOCATOR_HPP */
diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp
new file mode 100644
index 0000000000..1c8405723a
--- /dev/null
+++ b/include/RAJA/util/AllocatorPool.hpp
@@ -0,0 +1,518 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing an implementation of a memory pool.
+ *
+ ******************************************************************************
+ */
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJA_UTIL_ALLOCATORPOOL_HPP
+#define RAJA_UTIL_ALLOCATORPOOL_HPP
+
+#include "RAJA/config.hpp"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <list>
+#include <map>
+#include <string>
+
+#include "RAJA/util/align.hpp"
+#include "RAJA/util/mutex.hpp"
+#include "RAJA/util/Allocator.hpp"
+
+// Note that this header is not included in RAJA.hpp
+// to avoid this warning when using openmp enabled headers with
+// non-openmp compilers
+#if defined(RAJA_ENABLE_OPENMP) && !defined(_OPENMP)
+#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler
+#endif
+
+namespace RAJA
+{
+
+namespace detail
+{
+
+//! example allocator for AllocatorPool using allocate/deallocate
+struct HostBaseAllocator {
+
+  // returns a valid pointer on success, nullptr on failure
+  void* allocate(std::size_t nbytes)
+  {
+    return std::malloc(nbytes);
+  }
+
+  // returns true on success, false on failure
+  void deallocate(void* ptr)
+  {
+    std::free(ptr);
+  }
+};
+
+//! Class abstracting a range in memory
+struct MemoryChunk {
+  void* begin = nullptr;
+  void* end   = nullptr;
+
+  MemoryChunk() = default;
+
+  MemoryChunk(void* ptr, size_t nbytes)
+    : begin(ptr)
+    , end(static_cast<char*>(begin) + nbytes)
+  { }
+
+  MemoryChunk(void* begin_, void* end_)
+    : begin(begin_)
+    , end(end_)
+  { }
+
+  size_t nbytes() const
+  {
+    return static_cast<char*>(end) -
+           static_cast<char*>(begin);
+  }
+
+  explicit operator bool() const
+  {
+    return begin != nullptr;
+  }
+};
+
+/*! \class MemoryArena
+ ******************************************************************************
+ *
+ * \brief  MemoryArena is a map based subclass for class Allocator
+ * provides book-keeping to divy a large chunk of pre-allocated memory to avoid
+ * the overhead of  malloc/free or cudaMalloc/cudaFree, etc
+ *
+ * get/give are the primary calls used by class Allocator to get aligned memory
+ * from the pool or give it back
+ *
+ *
+ ******************************************************************************
+ */
+struct MemoryArena
+{
+  using free_type = std::map<void*, void*>;
+  using free_value_type = typename free_type::value_type;
+  using used_type = std::map<void*, void*>;
+  using used_value_type = typename used_type::value_type;
+
+
+  MemoryArena(MemoryChunk mem)
+    : m_allocation(mem)
+  {
+    if (m_allocation.begin == nullptr) {
+      RAJA_ABORT_OR_THROW("RAJA::detail::MemoryArena Attempt to create with no memory");
+    }
+    m_free_space[m_allocation.begin] = m_allocation.end ;
+  }
+
+  MemoryArena(MemoryArena const&) = delete;
+  MemoryArena& operator=(MemoryArena const&) = delete;
+
+  MemoryArena(MemoryArena&&) = default;
+  MemoryArena& operator=(MemoryArena&&) = default;
+
+  size_t capacity() const
+  {
+    return m_allocation.nbytes();
+  }
+
+  bool unused() const
+  {
+    return m_used_space.empty();
+  }
+
+  MemoryChunk get_allocation()
+  {
+    return m_allocation;
+  }
+
+  MemoryChunk get(size_t nbytes, size_t alignment)
+  {
+    MemoryChunk mem;
+    if (capacity() >= nbytes) {
+      free_type::iterator end = m_free_space.end();
+      for (free_type::iterator iter = m_free_space.begin(); iter != end;
+           ++iter) {
+
+        void* adj_ptr = iter->first;
+        size_t cap =
+            static_cast<char*>(iter->second) - static_cast<char*>(adj_ptr);
+
+        if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) {
+
+          mem = MemoryChunk(adj_ptr, nbytes);
+
+          remove_free_chunk(iter,
+                            mem);
+
+          add_used_chunk(mem);
+
+          break;
+        }
+      }
+    }
+    return mem;
+  }
+
+  MemoryChunk give(void* ptr)
+  {
+    MemoryChunk mem;
+    if (m_allocation.begin <= ptr && ptr < m_allocation.end) {
+
+      used_type::iterator found = m_used_space.find(ptr);
+
+      if (found != m_used_space.end()) {
+
+        mem = MemoryChunk(found->first, found->second);
+
+        add_free_chunk(mem);
+
+        m_used_space.erase(found);
+
+      } else {
+        RAJA_ABORT_OR_THROW("RAJA::detail::MemoryArena::give invalid ptr");
+      }
+    }
+    return mem;
+  }
+
+private:
+  void add_free_chunk(MemoryChunk mem)
+  {
+    // integrates a chunk of memory into free_space
+    free_type::iterator invl = m_free_space.end();
+    free_type::iterator next = m_free_space.lower_bound(mem.begin);
+
+    // check if prev exists
+    if (next != m_free_space.begin()) {
+      // check if prev can cover [begin, end)
+      free_type::iterator prev = next;
+      --prev;
+      if (prev->second == mem.begin) {
+        // extend prev to cover [begin, end)
+        prev->second = mem.end;
+
+        // check if prev can cover next too
+        if (next != invl) {
+          assert(next->first != mem.begin);
+
+          if (next->first == mem.end) {
+            // extend prev to cover next too
+            prev->second = next->second;
+
+            // remove redundant next
+            m_free_space.erase(next);
+          }
+        }
+        return;
+      }
+    }
+
+    if (next != invl) {
+      assert(next->first != mem.begin);
+
+      if (next->first == mem.end) {
+        // extend next to cover [begin, end)
+        m_free_space.insert(next, free_value_type{mem.begin, next->second});
+        m_free_space.erase(next);
+
+        return;
+      }
+    }
+
+    // no free space adjacent to this chunk, add separate free chunk [begin,
+    // end)
+    m_free_space.insert(next, free_value_type{mem.begin, mem.end});
+  }
+
+  void remove_free_chunk(free_type::iterator iter, MemoryChunk mem)
+  {
+    void* ptr = iter->first;
+    void* ptr_end = iter->second;
+
+    // fixup m_free_space, shrinking and adding chunks as needed
+    if (ptr != mem.begin) {
+
+      // shrink end of current free region to [ptr, begin)
+      iter->second = mem.begin;
+
+      if (mem.end != ptr_end) {
+
+        // insert free region [end, ptr_end) after current free region
+        free_type::iterator next = iter;
+        ++next;
+        m_free_space.insert(next, free_value_type{mem.end, ptr_end});
+      }
+
+    } else if (mem.end != ptr_end) {
+
+      // shrink beginning of current free region to [end, ptr_end)
+      free_type::iterator next = iter;
+      ++next;
+      m_free_space.insert(next, free_value_type{mem.end, ptr_end});
+      m_free_space.erase(iter);
+
+    } else {
+
+      // can not reuse current region, erase
+      m_free_space.erase(iter);
+    }
+  }
+
+  void add_used_chunk(MemoryChunk mem)
+  {
+    // simply inserts a chunk of memory into used_space
+    m_used_space.insert(used_value_type{mem.begin, mem.end});
+  }
+
+  MemoryChunk m_allocation;
+  free_type m_free_space;
+  used_type m_used_space;
+};
+
+} /* end namespace detail */
+
+
+/*! \class AllocatorPool
+ ******************************************************************************
+ *
+ * \brief  AllocatorPool provides a a RAJA::Allocator that is a basic memory
+ *  pool on top of the given allocator_type.
+ *
+ * This is used for RAJA's internal allocations by default, but a different
+ * Allocator can be set for specific backend allocators. For example
+ * RAJA::cuda::set_device_allocator allows the user to change the device
+ * allocator used by RAJA internally.
+ *
+ * AllocatorPool uses MemoryArena to do the heavy lifting of maintaining
+ * access to the used/free space.
+ *
+ * The following are some examples
+ *
+ * //! example allocator for AllocatorPool using allocate/deallocate
+ * struct host_allocator {
+ *
+ *   // returns a valid pointer on success, nullptr on failure
+ *   void* allocate(std::size_t nbytes)
+ *   {
+ *     return std::malloc(nbytes);
+ *   }
+ *
+ *   // returns true on success, false on failure
+ *   void deallocate(void* ptr)
+ *   {
+ *     std::free(ptr);
+ *   }
+ * };
+ *
+ * RAJA::Allocator* aloc =
+ *     new RAJA::AllocatorPool<host_allocator>();
+ *
+ ******************************************************************************
+ */
+template <typename allocator_t>
+struct AllocatorPool : Allocator
+{
+  using allocator_type = allocator_t;
+
+  static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull;
+
+  AllocatorPool(std::string const& name,
+                allocator_type const& aloc = allocator_type{},
+                size_t default_arena_size = default_default_arena_size)
+    : m_default_arena_size(default_arena_size)
+    , m_alloc(aloc)
+    , m_name(name) // std::string("RAJA::AllocatorPool<")+m_alloc.getName()+">")
+  {
+  }
+
+  // not copyable or movable
+  AllocatorPool(AllocatorPool const&) = delete;
+  AllocatorPool(AllocatorPool &&) = delete;
+  AllocatorPool& operator=(AllocatorPool const&) = delete;
+  AllocatorPool& operator=(AllocatorPool &&) = delete;
+
+  virtual ~AllocatorPool()
+  {
+    // When used with static storage duration, it is possible to encounter
+    // errors like cudaErrorCudartUnloading with cudaFree. So do not call
+    // release here to avoid potential cuda calls and errors.
+  }
+
+  void* allocate(size_t nbytes,
+                 size_t alignment = alignof(std::max_align_t)) final
+  {
+    if (nbytes == 0) return nullptr;
+
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+
+    detail::MemoryChunk mem;
+
+    // find a usable memory chunk in an existing arena
+    arena_container_type::iterator end = m_arenas.end();
+    for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
+         ++iter) {
+      mem = iter->get(nbytes, alignment);
+      if (mem.begin != nullptr) {
+        break;
+      }
+    }
+
+    // allocate a new memory chunk
+    if (mem.begin == nullptr) {
+      const size_t alloc_size =
+          std::max(nbytes + alignment, m_default_arena_size);
+      detail::MemoryChunk arena_mem(m_alloc.allocate(alloc_size), alloc_size);
+      if (arena_mem.begin != nullptr) {
+        m_arenas.emplace_front(arena_mem);
+        m_actualSize += m_arenas.front().capacity();
+        mem = m_arenas.front().get(nbytes, alignment);
+      } else{
+        RAJA_ABORT_OR_THROW("RAJA::AllocatorPool::allocate arena allocation failed");
+      }
+    }
+
+    // update stats
+    m_currentSize += mem.nbytes();
+    if (m_currentSize > m_highWatermark) {
+      m_highWatermark = m_currentSize;
+    }
+    m_allocationCount += 1;
+
+    return mem.begin;
+  }
+
+  void deallocate(void* ptr) final
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+
+    // find arena that owns ptr and return it
+    detail::MemoryChunk mem;
+    arena_container_type::iterator end = m_arenas.end();
+    for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
+         ++iter) {
+      if ( (mem = iter->give(ptr)) ) {
+        ptr = nullptr;
+        // update stats
+        m_currentSize -= mem.nbytes();
+        m_allocationCount -= 1;
+        break;
+      }
+    }
+
+    if (ptr != nullptr) {
+      RAJA_ABORT_OR_THROW("RAJA::AllocatorPool::deallocate unknown pointer");
+    }
+  }
+
+  void release() final
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+
+    for (auto it = m_arenas.begin(); it != m_arenas.end(); /* do nothing */) {
+      if (it->unused()) {
+        // update stats
+        m_actualSize -= it->capacity();
+        // deallocate memory
+        detail::MemoryChunk mem = it->get_allocation();
+        m_alloc.deallocate(mem.begin);
+        // erase
+        it = m_arenas.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+
+  size_t get_arena_size() const
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+
+    return m_default_arena_size;
+  }
+
+  size_t set_arena_size(size_t new_arena_size)
+  {
+#if defined(RAJA_ENABLE_OPENMP)
+    lock_guard<omp::mutex> lock(m_mutex);
+#endif
+
+    size_t prev_size = m_default_arena_size;
+    m_default_arena_size = new_arena_size;
+    return prev_size;
+  }
+
+  size_t getHighWatermark() const noexcept final
+  {
+    return m_highWatermark;
+  }
+
+  size_t getCurrentSize() const noexcept final
+  {
+    return m_currentSize;
+  }
+
+  size_t getActualSize() const noexcept final
+  {
+    return m_actualSize;
+  }
+
+  size_t getAllocationCount() const noexcept final
+  {
+    return m_allocationCount;
+  }
+
+  const std::string& getName() const noexcept final
+  {
+    return m_name;
+  }
+
+  // Platform getPlatform() const noexcept final
+  // {
+  //   return m_alloc.getPlatform();
+  // }
+
+private:
+  using arena_container_type = std::list<detail::MemoryArena>;
+
+  arena_container_type m_arenas;
+  size_t m_default_arena_size;
+  allocator_t m_alloc;
+  std::string m_name;
+
+  size_t m_highWatermark = 0;
+  size_t m_currentSize = 0;
+  size_t m_actualSize = 0;
+  size_t m_allocationCount = 0;
+
+#if defined(RAJA_ENABLE_OPENMP)
+  omp::mutex m_mutex;
+#endif
+};
+
+} /* end namespace RAJA */
+
+
+#endif /* RAJA_UTIL_ALLOCATORPOOL_HPP */
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
index 2c459c1683..b10587d84c 100644
--- a/include/RAJA/util/SoAPtr.hpp
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -20,6 +20,8 @@
 
 #include "RAJA/config.hpp"
 
+#include "RAJA/util/Allocator.hpp"
+
 // for RAJA::reduce::detail::ValueLoc
 #include "RAJA/pattern/detail/reduce.hpp"
 
@@ -36,29 +38,27 @@ namespace detail
  * This is useful for creating a vectorizable data layout and getting
  * coalesced memory accesses or avoiding shared memory bank conflicts in cuda.
  */
-template <typename T,
-          typename mempool = RAJA::basic_mempool::MemPool<
-              RAJA::basic_mempool::generic_allocator> >
+template <typename T>
 class SoAPtr
 {
   using value_type = T;
 
 public:
   SoAPtr() = default;
-  explicit SoAPtr(size_t size)
-      : mem(mempool::getInstance().template malloc<value_type>(size))
+  SoAPtr(Allocator& allocator, size_t size)
+      : mem(allocator.template allocate<value_type>(size))
   {
   }
 
-  SoAPtr& allocate(size_t size)
+  SoAPtr& allocate(Allocator& allocator, size_t size)
   {
-    mem = mempool::getInstance().template malloc<value_type>(size);
+    mem = allocator.template allocate<value_type>(size);
     return *this;
   }
 
-  SoAPtr& deallocate()
+  SoAPtr& deallocate(Allocator& allocator)
   {
-    mempool::getInstance().free(mem);
+    allocator.deallocate(mem);
     mem = nullptr;
     return *this;
   }
@@ -75,8 +75,8 @@ class SoAPtr
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, typename mempool>
-class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool>
+template <typename T, typename IndexType, bool doing_min>
+class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>>
 {
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
   using first_type = T;
@@ -84,24 +84,24 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool>
 
 public:
   SoAPtr() = default;
-  explicit SoAPtr(size_t size)
-      : mem(mempool::getInstance().template malloc<first_type>(size)),
-        mem_idx(mempool::getInstance().template malloc<second_type>(size))
+  SoAPtr(Allocator& allocator, size_t size)
+      : mem(allocator.template allocate<first_type>(size)),
+        mem_idx(allocator.template allocate<second_type>(size))
   {
   }
 
-  SoAPtr& allocate(size_t size)
+  SoAPtr& allocate(Allocator& allocator, size_t size)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
-    mem_idx = mempool::getInstance().template malloc<second_type>(size);
+    mem = allocator.template allocate<first_type>(size);
+    mem_idx = allocator.template allocate<second_type>(size);
     return *this;
   }
 
-  SoAPtr& deallocate()
+  SoAPtr& deallocate(Allocator& allocator)
   {
-    mempool::getInstance().free(mem);
+    allocator.deallocate(mem);
     mem = nullptr;
-    mempool::getInstance().free(mem_idx);
+    allocator.deallocate(mem_idx);
     mem_idx = nullptr;
     return *this;
   }
diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp
deleted file mode 100644
index 517fb5a086..0000000000
--- a/include/RAJA/util/basic_mempool.hpp
+++ /dev/null
@@ -1,427 +0,0 @@
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   RAJA header file containing an implementation of a memory pool.
- *
- ******************************************************************************
- */
-
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
-// and RAJA project contributors. See the RAJA/LICENSE file for details.
-//
-// SPDX-License-Identifier: (BSD-3-Clause)
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-#ifndef RAJA_BASIC_MEMPOOL_HPP
-#define RAJA_BASIC_MEMPOOL_HPP
-
-#include <cassert>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <list>
-#include <map>
-
-#include "RAJA/util/align.hpp"
-#include "RAJA/util/mutex.hpp"
-
-namespace RAJA
-{
-
-namespace basic_mempool
-{
-
-namespace detail
-{
-
-
-/*! \class MemoryArena
- ******************************************************************************
- *
- * \brief  MemoryArena is a map based subclass for class MemPool
- * provides book-keeping to divy a large chunk of pre-allocated memory to avoid
- * the overhead of  malloc/free or cudaMalloc/cudaFree, etc
- *
- * get/give are the primary calls used by class MemPool to get aligned memory
- * from the pool or give it back
- *
- *
- ******************************************************************************
- */
-class MemoryArena
-{
-public:
-  using free_type = std::map<void*, void*>;
-  using free_value_type = typename free_type::value_type;
-  using used_type = std::map<void*, void*>;
-  using used_value_type = typename used_type::value_type;
-
-  MemoryArena(void* ptr, size_t size)
-    : m_allocation{ ptr, static_cast<char*>(ptr)+size },
-      m_free_space(),
-      m_used_space()
-  {
-     m_free_space[ptr] = static_cast<char*>(ptr)+size ;
-    if (m_allocation.begin == nullptr) {
-      fprintf(stderr, "Attempt to create MemoryArena with no memory");
-      std::abort();
-    }
-  }
-
-  MemoryArena(MemoryArena const&) = delete;
-  MemoryArena& operator=(MemoryArena const&) = delete;
-
-  MemoryArena(MemoryArena&&) = default;
-  MemoryArena& operator=(MemoryArena&&) = default;
-
-  size_t capacity()
-  {
-    return static_cast<char*>(m_allocation.end) -
-           static_cast<char*>(m_allocation.begin);
-  }
-
-  bool unused() { return m_used_space.empty(); }
-
-  void* get_allocation() { return m_allocation.begin; }
-
-  void* get(size_t nbytes, size_t alignment)
-  {
-    void* ptr_out = nullptr;
-    if (capacity() >= nbytes) {
-      free_type::iterator end = m_free_space.end();
-      for (free_type::iterator iter = m_free_space.begin(); iter != end;
-           ++iter) {
-
-        void* adj_ptr = iter->first;
-        size_t cap =
-            static_cast<char*>(iter->second) - static_cast<char*>(adj_ptr);
-
-        if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) {
-
-          ptr_out = adj_ptr;
-
-          remove_free_chunk(iter,
-                            adj_ptr,
-                            static_cast<char*>(adj_ptr) + nbytes);
-
-          add_used_chunk(adj_ptr, static_cast<char*>(adj_ptr) + nbytes);
-
-          break;
-        }
-      }
-    }
-    return ptr_out;
-  }
-
-  bool give(void* ptr)
-  {
-    if (m_allocation.begin <= ptr && ptr < m_allocation.end) {
-
-      used_type::iterator found = m_used_space.find(ptr);
-
-      if (found != m_used_space.end()) {
-
-        add_free_chunk(found->first, found->second);
-
-        m_used_space.erase(found);
-
-      } else {
-        fprintf(stderr, "Invalid free %p", ptr);
-        std::abort();
-      }
-
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-private:
-  struct memory_chunk {
-    void* begin;
-    void* end;
-  };
-
-  void add_free_chunk(void* begin, void* end)
-  {
-    // integrates a chunk of memory into free_space
-    free_type::iterator invl = m_free_space.end();
-    free_type::iterator next = m_free_space.lower_bound(begin);
-
-    // check if prev exists
-    if (next != m_free_space.begin()) {
-      // check if prev can cover [begin, end)
-      free_type::iterator prev = next;
-      --prev;
-      if (prev->second == begin) {
-        // extend prev to cover [begin, end)
-        prev->second = end;
-
-        // check if prev can cover next too
-        if (next != invl) {
-          assert(next->first != begin);
-
-          if (next->first == end) {
-            // extend prev to cover next too
-            prev->second = next->second;
-
-            // remove redundant next
-            m_free_space.erase(next);
-          }
-        }
-        return;
-      }
-    }
-
-    if (next != invl) {
-      assert(next->first != begin);
-
-      if (next->first == end) {
-        // extend next to cover [begin, end)
-        m_free_space.insert(next, free_value_type{begin, next->second});
-        m_free_space.erase(next);
-
-        return;
-      }
-    }
-
-    // no free space adjacent to this chunk, add seperate free chunk [begin,
-    // end)
-    m_free_space.insert(next, free_value_type{begin, end});
-  }
-
-  void remove_free_chunk(free_type::iterator iter, void* begin, void* end)
-  {
-
-    void* ptr = iter->first;
-    void* ptr_end = iter->second;
-
-    // fixup m_free_space, shrinking and adding chunks as needed
-    if (ptr != begin) {
-
-      // shrink end of current free region to [ptr, begin)
-      iter->second = begin;
-
-      if (end != ptr_end) {
-
-        // insert free region [end, ptr_end) after current free region
-        free_type::iterator next = iter;
-        ++next;
-        m_free_space.insert(next, free_value_type{end, ptr_end});
-      }
-
-    } else if (end != ptr_end) {
-
-      // shrink beginning of current free region to [end, ptr_end)
-      free_type::iterator next = iter;
-      ++next;
-      m_free_space.insert(next, free_value_type{end, ptr_end});
-      m_free_space.erase(iter);
-
-    } else {
-
-      // can not reuse current region, erase
-      m_free_space.erase(iter);
-    }
-  }
-
-  void add_used_chunk(void* begin, void* end)
-  {
-    // simply inserts a chunk of memory into used_space
-    m_used_space.insert(used_value_type{begin, end});
-  }
-
-  memory_chunk m_allocation;
-  free_type m_free_space;
-  used_type m_used_space;
-};
-
-} /* end namespace detail */
-
-
-/*! \class MemPool
- ******************************************************************************
- *
- * \brief  MemPool pre-allocates a large chunk of memory and provides generic
- * malloc/free for the user to allocate aligned data within the pool
- *
- * MemPool uses MemoryArena to do the heavy lifting of maintaining access to
- * the used/free space.
- *
- * MemPool provides an example generic_allocator which can guide more
- *specialized
- * allocators. The following are some examples
- *
- * using device_mempool_type = basic_mempool::MemPool<cuda::DeviceAllocator>;
- * using device_zeroed_mempool_type =
- *basic_mempool::MemPool<cuda::DeviceZeroedAllocator>;
- * using pinned_mempool_type = basic_mempool::MemPool<cuda::PinnedAllocator>;
- *
- * The user provides the specialized allocator, for example :
- * struct DeviceAllocator {
- *
- *  // returns a valid pointer on success, nullptr on failure
- *  void* malloc(size_t nbytes)
- *  {
- *    void* ptr;
- *    cudaErrchk(cudaMalloc(&ptr, nbytes));
- *    return ptr;
- *  }
- *
- *  // returns true on success, false on failure
- *  bool free(void* ptr)
- *  {
- *    cudaErrchk(cudaFree(ptr));
- *    return true;
- *  }
- * };
- *
- *
- ******************************************************************************
- */
-template <typename allocator_t>
-class MemPool
-{
-public:
-  using allocator_type = allocator_t;
-
-  static inline MemPool<allocator_t>& getInstance()
-  {
-    static MemPool<allocator_t> pool{};
-    return pool;
-  }
-
-  static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull;
-
-  MemPool()
-      : m_arenas(), m_default_arena_size(default_default_arena_size), m_alloc()
-  {
-  }
-
-  ~MemPool()
-  {
-    // With static objects like MemPool, cudaErrorCudartUnloading is a possible
-    // error with cudaFree
-    // So no more cuda calls here
-  }
-
-
-  void free_chunks()
-  {
-#if defined(RAJA_ENABLE_OPENMP)
-    lock_guard<omp::mutex> lock(m_mutex);
-#endif
-
-    while (!m_arenas.empty()) {
-      void* allocation_ptr = m_arenas.front().get_allocation();
-      m_alloc.free(allocation_ptr);
-      m_arenas.pop_front();
-    }
-  }
-
-  size_t arena_size()
-  {
-#if defined(RAJA_ENABLE_OPENMP)
-    lock_guard<omp::mutex> lock(m_mutex);
-#endif
-
-    return m_default_arena_size;
-  }
-
-  size_t arena_size(size_t new_size)
-  {
-#if defined(RAJA_ENABLE_OPENMP)
-    lock_guard<omp::mutex> lock(m_mutex);
-#endif
-
-    size_t prev_size = m_default_arena_size;
-    m_default_arena_size = new_size;
-    return prev_size;
-  }
-
-  template <typename T>
-  T* malloc(size_t nTs, size_t alignment = alignof(T))
-  {
-#if defined(RAJA_ENABLE_OPENMP)
-    lock_guard<omp::mutex> lock(m_mutex);
-#endif
-
-    const size_t size = nTs * sizeof(T);
-    void* ptr = nullptr;
-    arena_container_type::iterator end = m_arenas.end();
-    for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
-      ptr = iter->get(size, alignment);
-      if (ptr != nullptr) {
-        break;
-      }
-    }
-
-    if (ptr == nullptr) {
-      const size_t alloc_size =
-          std::max(size + alignment, m_default_arena_size);
-      void* arena_ptr = m_alloc.malloc(alloc_size);
-      if (arena_ptr != nullptr) {
-        m_arenas.emplace_front(arena_ptr, alloc_size);
-        ptr = m_arenas.front().get(size, alignment);
-      }
-    }
-
-    return static_cast<T*>(ptr);
-  }
-
-  void free(const void* cptr)
-  {
-#if defined(RAJA_ENABLE_OPENMP)
-    lock_guard<omp::mutex> lock(m_mutex);
-#endif
-
-    void* ptr = const_cast<void*>(cptr);
-    arena_container_type::iterator end = m_arenas.end();
-    for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
-      if (iter->give(ptr)) {
-        ptr = nullptr;
-        break;
-      }
-    }
-    if (ptr != nullptr) {
-      fprintf(stderr, "Unknown pointer %p", ptr);
-    }
-  }
-
-private:
-  using arena_container_type = std::list<detail::MemoryArena>;
-
-#if defined(RAJA_ENABLE_OPENMP)
-  omp::mutex m_mutex;
-#endif
-
-  arena_container_type m_arenas;
-  size_t m_default_arena_size;
-  allocator_t m_alloc;
-};
-
-//! example allocator for basic_mempool using malloc/free
-struct generic_allocator {
-
-  // returns a valid pointer on success, nullptr on failure
-  void* malloc(size_t nbytes) { return std::malloc(nbytes); }
-
-  // returns true on success, false on failure
-  bool free(void* ptr)
-  {
-    std::free(ptr);
-    return true;
-  }
-};
-
-} /* end namespace basic_mempool */
-
-} /* end namespace RAJA */
-
-
-#endif /* RAJA_BASIC_MEMPOOL_HPP */

From b85f98c4282b51abb5b8fc5336c6ffea30c2346f Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 12 Oct 2021 23:20:46 -0700
Subject: [PATCH 2/7] Add missing include in tut_sort

---
 examples/tut_sort.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/tut_sort.cpp b/examples/tut_sort.cpp
index 3eeb5cc575..a856b7236a 100644
--- a/examples/tut_sort.cpp
+++ b/examples/tut_sort.cpp
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <numeric>
 #include <random>
+#include <list>
 #include <map>
 #include <unordered_map>
 #include <set>

From 49b16e428f13fad72e81ad63db4e6005dbc3be9f Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 12 Oct 2021 23:47:25 -0700
Subject: [PATCH 3/7] Register and deregister Allocators from list

---
 include/RAJA/util/Allocator.hpp | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp
index f0e226bfb7..93beed450d 100644
--- a/include/RAJA/util/Allocator.hpp
+++ b/include/RAJA/util/Allocator.hpp
@@ -27,6 +27,15 @@
 namespace RAJA
 {
 
+struct Allocator;
+
+namespace detail
+{
+
+inline std::vector<Allocator*>& get_allocators();
+
+} /* end namespace detail */
+
 /*! \class Allocator
  ******************************************************************************
  *
@@ -38,7 +47,10 @@ namespace RAJA
  */
 struct Allocator
 {
-  Allocator() = default;
+  Allocator()
+  {
+    detail::get_allocators().emplace_back(this);
+  }
 
   // not copyable or movable
   Allocator(Allocator const&) = delete;
@@ -46,7 +58,18 @@ struct Allocator
   Allocator& operator=(Allocator const&) = delete;
   Allocator& operator=(Allocator &&) = delete;
 
-  virtual ~Allocator() = default;
+  virtual ~Allocator()
+  {
+    auto& allocators = detail::get_allocators();
+    for (auto iter = allocators.cbegin();
+         iter != allocators.cend();
+         ++iter) {
+      if (this == *iter) {
+        allocators.erase(iter);
+        break;
+      }
+    }
+  }
 
   virtual void* allocate(size_t nbytes,
                          size_t alignment = alignof(std::max_align_t)) = 0;

From 0c393cdeb26a3d44cce7c87c0409f85dc9574b89 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 12 Oct 2021 23:47:45 -0700
Subject: [PATCH 4/7] Return allocators vector by const ref

---
 include/RAJA/util/Allocator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp
index 93beed450d..921b816b8a 100644
--- a/include/RAJA/util/Allocator.hpp
+++ b/include/RAJA/util/Allocator.hpp
@@ -112,7 +112,7 @@ inline std::vector<Allocator*>& get_allocators()
 /*!
  * \brief Get the set of allocators used by RAJA internally
  */
-inline std::vector<Allocator*> get_allocators()
+inline std::vector<Allocator*> const& get_allocators()
 {
   return detail::get_allocators();
 }

From 0287c8efa77f6e2456f20706c4cb986fc0b5cf2a Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 12 Oct 2021 23:47:59 -0700
Subject: [PATCH 5/7] Fix compile error in AllocatorPool

---
 include/RAJA/util/AllocatorPool.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp
index 1c8405723a..e12dce00da 100644
--- a/include/RAJA/util/AllocatorPool.hpp
+++ b/include/RAJA/util/AllocatorPool.hpp
@@ -444,7 +444,7 @@ struct AllocatorPool : Allocator
     }
   }
 
-  size_t get_arena_size() const
+  size_t get_arena_size()
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);

From e5d6acab61220d534ea0127e600bccabaf5724b8 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Fri, 29 Oct 2021 13:40:07 -0700
Subject: [PATCH 6/7] Add AllocatorWithStats

This is a layer under Allocator that can provide stats.
The idea is to allow users to expose stats for their
allocators or not.
---
 include/RAJA/policy/cuda/MemUtils_CUDA.hpp |  3 +++
 include/RAJA/policy/hip/MemUtils_HIP.hpp   |  3 +++
 include/RAJA/util/Allocator.hpp            | 28 +++++++++++++++++++---
 include/RAJA/util/AllocatorPool.hpp        | 13 ++++++----
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index b7275d044c..fb74e35680 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -107,6 +107,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_device_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<DeviceBaseAllocator>(
         std::string("RAJA::cuda::default_device_allocator"),
+        Platform::cuda,
         DeviceBaseAllocator()));
 }
 ///
@@ -115,6 +116,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_device_zeroed_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<DeviceZeroedBaseAllocator>(
         std::string("RAJA::cuda::default_device_zeroed_allocator"),
+        Platform::cuda,
         DeviceZeroedBaseAllocator()));
 }
 ///
@@ -123,6 +125,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_pinned_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<PinnedBaseAllocator>(
         std::string("RAJA::cuda::default_pinned_allocator"),
+        Platform::cuda,
         PinnedBaseAllocator()));
 }
 
diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index 5f32b83b94..34a1894926 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -110,6 +110,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_device_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<DeviceBaseAllocator>(
         std::string("RAJA::hip::default_device_allocator"),
+        Platform::hip,
         DeviceBaseAllocator()));
 }
 ///
@@ -118,6 +119,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_device_zeroed_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<DeviceZeroedBaseAllocator>(
         std::string("RAJA::hip::default_device_zeroed_allocator"),
+        Platform::hip,
         DeviceZeroedBaseAllocator()));
 }
 ///
@@ -126,6 +128,7 @@ inline std::unique_ptr<RAJA::Allocator> make_default_pinned_allocator()
   return std::unique_ptr<RAJA::Allocator>(
       new AllocatorPool<PinnedBaseAllocator>(
         std::string("RAJA::hip::default_pinned_allocator"),
+        Platform::hip,
         PinnedBaseAllocator()));
 }
 
diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp
index 921b816b8a..38b525f637 100644
--- a/include/RAJA/util/Allocator.hpp
+++ b/include/RAJA/util/Allocator.hpp
@@ -85,6 +85,30 @@ struct Allocator
 
   virtual void release() = 0;
 
+  virtual const std::string& getName() const noexcept = 0;
+};
+
+
+/*! \class AllocatorWithStats
+ ******************************************************************************
+ *
+ * \brief  AllocatorWithStats Provides a generic interface for allocation and
+ *         getting allocation statistics in RAJA
+ *
+ ******************************************************************************
+ */
+struct AllocatorWithStats : Allocator
+{
+  AllocatorWithStats() = default;
+
+  // not copyable or movable
+  AllocatorWithStats(AllocatorWithStats const&) = delete;
+  AllocatorWithStats(AllocatorWithStats &&) = delete;
+  AllocatorWithStats& operator=(AllocatorWithStats const&) = delete;
+  AllocatorWithStats& operator=(AllocatorWithStats &&) = delete;
+
+  virtual ~AllocatorWithStats() = default;
+
   virtual size_t getHighWatermark() const noexcept = 0;
 
   virtual size_t getCurrentSize() const noexcept = 0;
@@ -93,9 +117,7 @@ struct Allocator
 
   virtual size_t getAllocationCount() const noexcept = 0;
 
-  virtual const std::string& getName() const noexcept = 0;
-
-  // virtual Platform getPlatform() const noexcept = 0;
+  virtual Platform getPlatform() const noexcept = 0;
 };
 
 namespace detail
diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp
index e12dce00da..a75d5b80ad 100644
--- a/include/RAJA/util/AllocatorPool.hpp
+++ b/include/RAJA/util/AllocatorPool.hpp
@@ -325,18 +325,20 @@ struct MemoryArena
  ******************************************************************************
  */
 template <typename allocator_t>
-struct AllocatorPool : Allocator
+struct AllocatorPool : AllocatorWithStats
 {
   using allocator_type = allocator_t;
 
   static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull;
 
   AllocatorPool(std::string const& name,
+                Platform platform,
                 allocator_type const& aloc = allocator_type{},
                 size_t default_arena_size = default_default_arena_size)
     : m_default_arena_size(default_arena_size)
     , m_alloc(aloc)
     , m_name(name) // std::string("RAJA::AllocatorPool<")+m_alloc.getName()+">")
+    , m_platform(platform)
   {
   }
 
@@ -489,10 +491,10 @@ struct AllocatorPool : Allocator
     return m_name;
   }
 
-  // Platform getPlatform() const noexcept final
-  // {
-  //   return m_alloc.getPlatform();
-  // }
+  Platform getPlatform() const noexcept final
+  {
+    return m_platform;
+  }
 
 private:
   using arena_container_type = std::list<detail::MemoryArena>;
@@ -501,6 +503,7 @@ struct AllocatorPool : Allocator
   size_t m_default_arena_size;
   allocator_t m_alloc;
   std::string m_name;
+  Platform m_platform;
 
   size_t m_highWatermark = 0;
   size_t m_currentSize = 0;

From 58d4bcdf6978fb1b8c960ed4b076c7f9bf8f5272 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Wed, 24 Nov 2021 16:45:35 -0800
Subject: [PATCH 7/7] Add basic example

---
 examples/CMakeLists.txt     |   4 +
 examples/tut_allocators.cpp | 340 ++++++++++++++++++++++++++++++++++++
 2 files changed, 344 insertions(+)
 create mode 100644 examples/tut_allocators.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c7cbd1bd47..2ef0a0ed04 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -139,4 +139,8 @@ raja_add_executable(
   NAME resource-teams
   SOURCES resource-teams.cpp)
 
+raja_add_executable(
+  NAME tut_allocators
+  SOURCES tut_allocators.cpp)
+
 add_subdirectory(plugin)
diff --git a/examples/tut_allocators.cpp b/examples/tut_allocators.cpp
new file mode 100644
index 0000000000..87cac9ec32
--- /dev/null
+++ b/examples/tut_allocators.cpp
@@ -0,0 +1,340 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "memoryManager.hpp"
+
+#include "RAJA/RAJA.hpp"
+
+/*
+ *  Reduction Example
+ *
+ *  This example illustrates use of the RAJA Allocators.
+ *
+ *  RAJA features shown:
+ *    -  Allocator types
+ *    - `forall` loop iteration template method
+ *    -  Index range segment
+ *    -  Execution policies
+ *    -  Reduction types
+ *
+ * If CUDA is enabled, CUDA unified memory is used.
+ */
+
+/*
+  CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block
+*/
+#if defined(RAJA_ENABLE_CUDA)
+const int CUDA_BLOCK_SIZE = 256;
+#endif
+
+#if defined(RAJA_ENABLE_HIP)
+const int HIP_BLOCK_SIZE = 256;
+#endif
+
+
+// Allocator class derived from RAJA::Allocator
+// using memoryManager allocations.
+struct ExampleAllocator : RAJA::Allocator
+{
+  // Allocators may take any constructor args as they are passed through the
+  // RAJA::*::set_*_allocator calls to the constructor.
+  ExampleAllocator(const std::string& name)
+    : m_name(name)
+  {
+    std::cout << "\t\t" << getName() << " constructor" << std::endl;
+  }
+
+  // Virtual destructor.
+  // Care should be taken as this may be called after main has returned.
+  virtual ~ExampleAllocator()
+  {
+    std::cout << "\t\t" << getName() << " destructor" << std::endl;
+  }
+
+  // Override the allocate method.
+  void* allocate(size_t nbytes,
+                 size_t alignment) override
+  {
+    std::cout << "\t\t" << getName() << " allocate nbytes " << nbytes
+              << " alignment " << alignment << std::endl;
+    void* ptr = memoryManager::allocate<char>(nbytes);
+    std::cout << "\t\t" << getName() << "          ptr " << ptr << std::endl;
+    return ptr;
+  }
+
+  // Override the deallocate method.
+  void deallocate(void* ptr) override
+  {
+    std::cout << "\t\t" << getName() << " deallocate ptr " << ptr << std::endl;
+    memoryManager::deallocate(ptr);
+  }
+
+  // Override the release method.
+  // Used to release memory held by things like pool allocators.
+  // Called on the old allocator when changing allocators via
+  // RAJA::*::set_*_allocator and RAJA::*::reset_*_allocator calls.
+  void release() override
+  {
+    std::cout << "\t\t" << getName() << " release" << std::endl;
+  }
+
+  // override the getName method
+  const std::string& getName() const noexcept override
+  {
+    return m_name;
+  }
+private:
+  std::string m_name;
+};
+
+
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+
+// Allocator class derived from RAJA::Allocator
+// using memoryManager gpu allocations.
+struct ExampleAllocatorGPU : RAJA::Allocator
+{
+  // Allocators may take any constructor args as they are passed through the
+  // RAJA::*::set_*_allocator calls to the constructor.
+  ExampleAllocatorGPU(const std::string& name)
+    : m_name(name)
+  {
+    std::cout << "\t\t" << getName() << " constructor" << std::endl;
+  }
+
+  // Virtual destructor.
+  // Care should be taken as this may be called after main has returned.
+  virtual ~ExampleAllocatorGPU()
+  {
+    std::cout << "\t\t" << getName() << " destructor" << std::endl;
+  }
+
+  // Override the allocate method.
+  void* allocate(size_t nbytes,
+                 size_t alignment) override
+  {
+    std::cout << "\t\t" << getName() << " allocate nbytes " << nbytes
+              << " alignment " << alignment << std::endl;
+    void* ptr = memoryManager::allocate_gpu<char>(nbytes);
+    std::cout << "\t\t" << getName() << "          ptr " << ptr << std::endl;
+    return ptr;
+  }
+
+  // Override the deallocate method.
+  void deallocate(void* ptr) override
+  {
+    std::cout << "\t\t" << getName() << " deallocate ptr " << ptr << std::endl;
+    memoryManager::deallocate_gpu(ptr);
+  }
+
+  // Override the release method.
+  // Used to release memory held by things like pool allocators.
+  // Called on the old allocator when changing allocators via
+  // RAJA::*::set_*_allocator and RAJA::*::reset_*_allocator calls.
+  void release() override
+  {
+    std::cout << "\t\t" << getName() << " release" << std::endl;
+  }
+
+  // override the getName method
+  const std::string& getName() const noexcept override
+  {
+    return m_name;
+  }
+private:
+  std::string m_name;
+};
+
+#endif
+
+
+int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[]))
+{
+
+  std::cout << "\n\nRAJA allocators example...\n";
+
+//
+// Define array length
+//
+  const int N = 1000000;
+
+//
+// Allocate array data and initialize data to alternating sequence of 1, -1.
+//
+  int* a = memoryManager::allocate<int>(N);
+
+  for (int i = 0; i < N; ++i) {
+    if ( i % 2 == 0 ) {
+      a[i] = 1;
+    } else {
+      a[i] = -1;
+    }
+  }
+
+//
+// Note: with this data initialization scheme, the following results will
+//       be observed for all reduction kernels below:
+//
+//  - the sum will be zero
+//
+
+//
+// Define index range for iterating over a elements in all examples
+//
+  RAJA::RangeSegment arange(0, N);
+
+
+//----------------------------------------------------------------------------//
+  {
+    std::cout << "\n Running RAJA sequential reduction...\n";
+
+    using EXEC_POL1   = RAJA::seq_exec;
+    using REDUCE_POL1 = RAJA::seq_reduce;
+
+    RAJA::ReduceSum<REDUCE_POL1, int> seq_sum(0);
+
+    RAJA::forall<EXEC_POL1>(arange, [=](int i) {
+
+      seq_sum += a[i];
+
+    });
+
+    std::cout << "\tsum = " << seq_sum.get() << std::endl;
+  }
+
+
+//----------------------------------------------------------------------------//
+
+#if defined(RAJA_ENABLE_OPENMP)
+  {
+    std::cout << "\n Running RAJA OpenMP reduction...\n";
+
+    using EXEC_POL2   = RAJA::omp_parallel_for_exec;
+    using REDUCE_POL2 = RAJA::omp_reduce;
+
+    RAJA::ReduceSum<REDUCE_POL2, int> omp_sum(0);
+
+    RAJA::forall<EXEC_POL2>(arange, [=](int i) {
+
+      omp_sum += a[i];
+
+    });
+
+    std::cout << "\tsum = " << omp_sum.get() << std::endl;
+  }
+#endif
+
+
+//----------------------------------------------------------------------------//
+
+#if defined(RAJA_ENABLE_CUDA)
+  {
+    std::cout << "\n Setting RAJA CUDA device allocator...\n";
+
+    RAJA::cuda::set_device_allocator<ExampleAllocatorGPU>("CUDA_ExampleAllocatorGPU");
+
+    std::cout << "\n Getting RAJA CUDA device allocator...\n";
+
+    RAJA::Allocator& cuda_device_allocator = RAJA::cuda::get_device_allocator();
+
+    std::cout << "\n Got RAJA CUDA device allocator " << cuda_device_allocator.getName() << "...\n";
+
+    {
+      std::cout << "\n Running RAJA CUDA reduction...\n";
+
+      using EXEC_POL3   = RAJA::cuda_exec<CUDA_BLOCK_SIZE>;
+      using REDUCE_POL3 = RAJA::cuda_reduce;
+
+      std::cout << "\n Constructing RAJA CUDA reduction object...\n";
+      RAJA::ReduceSum<REDUCE_POL3, int> cuda_sum(0);
+
+      std::cout << "\n Running RAJA CUDA reduction kernel...\n";
+      RAJA::forall<EXEC_POL3>(arange, [=] RAJA_DEVICE (int i) {
+
+        cuda_sum += a[i];
+
+      });
+
+
+      std::cout << "\n Getting RAJA CUDA reduction result...\n";
+
+      int result = cuda_sum.get();
+
+      std::cout << "\tsum = " << result << std::endl;
+    }
+
+    std::cout << "\n Resetting RAJA CUDA device allocator...\n";
+
+    RAJA::cuda::reset_device_allocator();
+
+    std::cout << "\n Done with RAJA CUDA device allocator...\n";
+  }
+#endif
+
+//----------------------------------------------------------------------------//
+
+#if defined(RAJA_ENABLE_HIP)
+  {
+    std::cout << "\n Setting RAJA HIP device allocator...\n";
+
+    RAJA::hip::set_device_allocator<ExampleAllocatorGPU>("HIP_ExampleAllocatorGPU");
+
+    std::cout << "\n Getting RAJA HIP device allocator...\n";
+
+    RAJA::Allocator& hip_device_allocator = RAJA::hip::get_device_allocator();
+
+    std::cout << "\n Got RAJA HIP device allocator " << hip_device_allocator.getName() << "...\n";
+
+    {
+      int* d_a = memoryManager::allocate_gpu<int>(N);
+      hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice ));
+
+      using EXEC_POL3   = RAJA::hip_exec<HIP_BLOCK_SIZE>;
+      using REDUCE_POL3 = RAJA::hip_reduce;
+
+      std::cout << "\n Constructing RAJA HIP reduction object...\n";
+      RAJA::ReduceSum<REDUCE_POL3, int> hip_sum(0);
+
+      std::cout << "\n Running RAJA HIP reduction kernel...\n";
+      RAJA::forall<EXEC_POL3>(arange, [=] RAJA_DEVICE (int i) {
+
+        hip_sum += d_a[i];
+
+      });
+
+      std::cout << "\n Getting RAJA HIP reduction result...\n";
+
+      int result = hip_sum.get();
+
+      std::cout << "\tsum = " << result << std::endl;
+
+      memoryManager::deallocate_gpu(d_a);
+    }
+
+    std::cout << "\n Resetting RAJA HIP device allocator...\n";
+
+    RAJA::hip::reset_device_allocator();
+
+    std::cout << "\n Done with RAJA HIP device allocator...\n";
+  }
+#endif
+
+//----------------------------------------------------------------------------//
+
+//
+// Clean up.
+//
+  memoryManager::deallocate(a);
+
+  std::cout << "\n DONE!...\n";
+
+  return 0;
+}