rapidsai · harrism · Sep 4, 2024 · Sep 4, 2024
@@ -19,10 +19,9 @@
 #include <cuml/common/logger.hpp>
 #include <cuml/common/utils.hpp>
 
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/util/cudart_utils.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-
 #include <cuda_runtime.h>
 
 #include <benchmark/benchmark.h>
@@ -165,15 +164,15 @@ class Fixture : public ::benchmark::Fixture {
   void alloc(T*& ptr, size_t len, bool init = false)
   {
     auto nBytes  = len * sizeof(T);
-    auto d_alloc = rmm::mr::get_current_device_resource();
+    auto d_alloc = raft::resource::get_current_device_resource();
     ptr          = (T*)d_alloc->allocate(nBytes, stream);
     if (init) { RAFT_CUDA_TRY(cudaMemsetAsync(ptr, 0, nBytes, stream)); }
   }
 
   template <typename T>
   void dealloc(T* ptr, size_t len)
   {
-    auto d_alloc = rmm::mr::get_current_device_resource();
+    auto d_alloc = raft::resource::get_current_device_resource();
     d_alloc->deallocate(ptr, len * sizeof(T), stream);
   }
 

@@ -21,9 +21,10 @@
 
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/cuda_stream.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <algorithm>
 #include <cmath>
@@ -200,11 +201,11 @@ int main(int argc, char* argv[])
   /* ======================= Begin GPU memory allocation ======================= */
   std::cout << "***************************************" << std::endl;
 
-  cudaStream_t stream;
+  rmm::cuda_stream stream;
   raft::handle_t handle{stream};
 
   // Begin recording time
-  cudaEventRecord(start, stream);
+  CUDA_RT_CALL(cudaEventRecord(start, stream.value()));
 
   rmm::device_uvector<float> dX_train(n_cols * n_train_rows, stream);
   rmm::device_uvector<float> dy_train(n_train_rows, stream);
@@ -215,46 +216,54 @@ int main(int argc, char* argv[])
   rmm::device_uvector<float> dy_pred(n_test_rows, stream);
   rmm::device_scalar<float> d_score{stream};
 
-  cg::program_t d_finalprogs;  // pointer to last generation ASTs on device
-
   CUDA_RT_CALL(cudaMemcpyAsync(dX_train.data(),
                                X_train.data(),
                                sizeof(float) * dX_train.size(),
                                cudaMemcpyHostToDevice,
-                               stream));
+                               stream.value()));
 
   CUDA_RT_CALL(cudaMemcpyAsync(dy_train.data(),
                                y_train.data(),
                                sizeof(float) * dy_train.size(),
                                cudaMemcpyHostToDevice,
-                               stream));
+                               stream.value()));
 
   CUDA_RT_CALL(cudaMemcpyAsync(dw_train.data(),
                                w_train.data(),
                                sizeof(float) * dw_train.size(),
                                cudaMemcpyHostToDevice,
-                               stream));
+                               stream.value()));
 
-  CUDA_RT_CALL(cudaMemcpyAsync(
-    dX_test.data(), X_test.data(), sizeof(float) * dX_test.size(), cudaMemcpyHostToDevice, stream));
+  CUDA_RT_CALL(cudaMemcpyAsync(dX_test.data(),
+                               X_test.data(),
+                               sizeof(float) * dX_test.size(),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
 
-  CUDA_RT_CALL(cudaMemcpyAsync(
-    dy_test.data(), y_test.data(), sizeof(float) * dy_test.size(), cudaMemcpyHostToDevice, stream));
+  CUDA_RT_CALL(cudaMemcpyAsync(dy_test.data(),
+                               y_test.data(),
+                               sizeof(float) * dy_test.size(),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
 
-  CUDA_RT_CALL(cudaMemcpyAsync(
-    dw_test.data(), w_test.data(), sizeof(float) * n_test_rows, cudaMemcpyHostToDevice, stream));
+  CUDA_RT_CALL(cudaMemcpyAsync(dw_test.data(),
+                               w_test.data(),
+                               sizeof(float) * n_test_rows,
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
 
   // Initialize AST
-  auto curr_mr = rmm::mr::get_current_device_resource();
-  d_finalprogs = static_cast<cg::program_t>(curr_mr->allocate(params.population_size, stream));
+  auto prog_buffer = rmm::device_buffer(params.population_size, stream);
+  // pointer to last generation ASTs on device
+  cg::program_t d_finalprogs = static_cast<cg::program_t>(prog_buffer.data());
 
   std::vector<std::vector<cg::program>> history;
   history.reserve(params.generations);
 
-  cudaEventRecord(stop, stream);
-  cudaEventSynchronize(stop);
+  CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
+  CUDA_RT_CALL(cudaEventSynchronize(stop));
   float alloc_time;
-  cudaEventElapsedTime(&alloc_time, start, stop);
+  CUDA_RT_CALL(cudaEventElapsedTime(&alloc_time, start, stop));
 
   std::cout << "Allocated device memory in " << std::setw(10) << alloc_time << "ms" << std::endl;
 
@@ -263,7 +272,7 @@ int main(int argc, char* argv[])
   std::cout << "***************************************" << std::endl;
   std::cout << std::setw(30) << "Beginning training for " << std::setw(15) << params.generations
             << " generations" << std::endl;
-  cudaEventRecord(start, stream);
+  CUDA_RT_CALL(cudaEventRecord(start, stream.value()));
 
   cg::symFit(handle,
              dX_train.data(),
@@ -275,10 +284,10 @@ int main(int argc, char* argv[])
              d_finalprogs,
              history);
 
-  cudaEventRecord(stop, stream);
-  cudaEventSynchronize(stop);
+  CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
+  CUDA_RT_CALL(cudaEventSynchronize(stop));
   float training_time;
-  cudaEventElapsedTime(&training_time, start, stop);
+  CUDA_RT_CALL(cudaEventElapsedTime(&training_time, start, stop));
 
   int n_gen = params.num_epochs;
   std::cout << std::setw(30) << "Convergence achieved in " << std::setw(15) << n_gen
@@ -308,7 +317,7 @@ int main(int argc, char* argv[])
 
   std::cout << "***************************************" << std::endl;
   std::cout << "Beginning Inference on test dataset " << std::endl;
-  cudaEventRecord(start, stream);
+  CUDA_RT_CALL(cudaEventRecord(start, stream.value()));
   cuml::genetic::symRegPredict(
     handle, dX_test.data(), n_test_rows, d_finalprogs + best_idx, dy_pred.data());
 
@@ -319,10 +328,10 @@ int main(int argc, char* argv[])
   cuml::genetic::compute_metric(
     handle, n_test_rows, 1, dy_test.data(), dy_pred.data(), dw_test.data(), d_score.data(), params);
 
-  cudaEventRecord(stop, stream);
-  cudaEventSynchronize(stop);
+  CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
+  CUDA_RT_CALL(cudaEventSynchronize(stop));
   float inference_time;
-  cudaEventElapsedTime(&inference_time, start, stop);
+  CUDA_RT_CALL(cudaEventElapsedTime(&inference_time, start, stop));
 
   // Output fitness score
   std::cout << "Inference score = " << d_score.value(stream) << std::endl;
@@ -336,9 +345,6 @@ int main(int argc, char* argv[])
   std::copy(y_test.begin(), y_test.begin() + 5, std::ostream_iterator<float>(std::cout, ";"));
   std::cout << std::endl;
 
-  /* ======================= Reset data ======================= */
-
-  curr_mr->deallocate(d_finalprogs, params.population_size, stream);
   CUDA_RT_CALL(cudaEventDestroy(start));
   CUDA_RT_CALL(cudaEventDestroy(stop));
   return 0;

@@ -16,10 +16,10 @@
 
 #pragma once
 
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/aligned.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda_runtime.h>
@@ -81,7 +81,7 @@ struct ARIMAParams {
    */
   void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
   {
-    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource();
     if (order.k && !tr)
       mu = (DataT*)rmm_alloc.allocate_async(
         batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
@@ -115,7 +115,7 @@ struct ARIMAParams {
    */
   void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
   {
-    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource();
     if (order.k && !tr)
       rmm_alloc.deallocate_async(
         mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);

@@ -23,14 +23,14 @@
 #include <cuml/genetic/genetic.h>
 #include <cuml/genetic/program.h>
 
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <device_launch_parameters.h>
 
@@ -229,17 +229,17 @@ void parallel_evolve(const raft::handle_t& h,
     program tmp(h_nextprogs[i]);
     delete[] tmp.nodes;
 
+    auto mr = raft::resource::get_current_device_resource_ref();
+
     // Set current generation device nodes
-    tmp.nodes = (node*)rmm::mr::get_current_device_resource()->allocate(
-      h_nextprogs[i].len * sizeof(node), stream);
+    tmp.nodes = static_cast<node*>(mr.allocate_async(h_nextprogs[i].len * sizeof(node), stream));
     raft::copy(tmp.nodes, h_nextprogs[i].nodes, h_nextprogs[i].len, stream);
     raft::copy(d_nextprogs + i, &tmp, 1, stream);
 
     if (generation > 1) {
       // Free device memory allocated to program nodes in previous generation
       raft::copy(&tmp, d_oldprogs + i, 1, stream);
-      rmm::mr::get_current_device_resource()->deallocate(
-        tmp.nodes, h_nextprogs[i].len * sizeof(node), stream);
+      mr.deallocate_async(tmp.nodes, h_nextprogs[i].len * sizeof(node), stream);
     }
 
     tmp.nodes = nullptr;
@@ -408,8 +408,9 @@ void symFit(const raft::handle_t& handle,
   std::vector<float> h_fitness(params.population_size, 0.0f);
 
   program_t d_currprogs;  // pointer to current programs
-  d_currprogs = (program_t)rmm::mr::get_current_device_resource()->allocate(
-    params.population_size * sizeof(program), stream);
+  auto mr = raft::resource::get_current_device_resource_ref();
+  d_currprogs =
+    static_cast<program_t>(mr.allocate_async(params.population_size * sizeof(program), stream));
   program_t d_nextprogs = final_progs;  // Reuse memory already allocated for final_progs
   final_progs           = nullptr;
 
@@ -490,8 +491,7 @@ void symFit(const raft::handle_t& handle,
   if (growAuto) { params.terminalRatio = 0.0f; }
 
   // Deallocate the previous generation device memory
-  rmm::mr::get_current_device_resource()->deallocate(
-    d_nextprogs, params.population_size * sizeof(program), stream);
+  mr.deallocate_async(d_nextprogs, params.population_size * sizeof(program), stream);
   d_currprogs = nullptr;
   d_nextprogs = nullptr;
 }

@@ -329,13 +329,13 @@ LinearSVMModel<T> LinearSVMModel<T>::allocate(const raft::handle_t& handle,
                                               const std::size_t nClasses)
 {
   auto stream                = handle.get_stream();
-  auto res                   = rmm::mr::get_current_device_resource();
+  auto res                   = raft::resource::get_current_device_resource_ref();
   const std::size_t coefRows = nCols + params.fit_intercept;
   const std::size_t coefCols = nClasses <= 2 ? 1 : nClasses;
   const std::size_t wSize    = coefRows * coefCols;
   const std::size_t cSize    = nClasses >= 2 ? nClasses : 0;
   const std::size_t pSize    = params.probability ? 2 * coefCols : 0;
-  auto bytes = static_cast<T*>(res->allocate(sizeof(T) * (wSize + cSize + pSize), stream));
+  auto bytes = static_cast<T*>(res.allocate_async(sizeof(T) * (wSize + cSize + pSize), stream));
   return LinearSVMModel<T>{/* .w         */ bytes,
                            /* .classes   */ cSize > 0 ? bytes + wSize : nullptr,
                            /* .probScale */ pSize > 0 ? bytes + wSize + cSize : nullptr,
@@ -347,13 +347,13 @@ template <typename T>
 void LinearSVMModel<T>::free(const raft::handle_t& handle, LinearSVMModel<T>& model)
 {
   auto stream                = handle.get_stream();
-  auto res                   = rmm::mr::get_current_device_resource();
+  auto res                   = raft::resource::get_current_device_resource_ref();
   const std::size_t coefRows = model.coefRows;
   const std::size_t coefCols = model.coefCols();
   const std::size_t wSize    = coefRows * coefCols;
   const std::size_t cSize    = model.nClasses;
   const std::size_t pSize    = model.probScale == nullptr ? 2 * coefCols : 0;
-  res->deallocate(model.w, sizeof(T) * (wSize + cSize + pSize), stream);
+  res.deallocate_async(model.w, sizeof(T) * (wSize + cSize + pSize), stream);
   model.w         = nullptr;
   model.classes   = nullptr;
   model.probScale = nullptr;

@@ -20,8 +20,10 @@
 #include "ws_util.cuh"
 
 #include <cuml/svm/svm_model.h>
+#include <cuml/svm/svm_parameter.h>
 
 #include <raft/core/handle.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/init.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
@@ -31,7 +33,6 @@
 
 #include <rmm/aligned.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_select.cuh>
@@ -73,7 +74,7 @@ class Results {
           const math_t* y,
           const math_t* C,
           SvmType svmType)
-    : rmm_alloc(rmm::mr::get_current_device_resource()),
+    : rmm_alloc(raft::resource::get_current_device_resource_ref()),
       stream(handle.get_stream()),
       handle(handle),
       n_rows(n_rows),

@@ -762,14 +762,15 @@ void extractRows(raft::device_csr_matrix_view<math_t, int, int, int> matrix_in,
   math_t* data_in    = matrix_in.get_elements().data();
 
   // allocate indptr
-  auto* rmm_alloc = rmm::mr::get_current_device_resource();
-  *indptr_out     = (int*)rmm_alloc->allocate((num_indices + 1) * sizeof(int), stream);
+  auto rmm_alloc = raft::resource::get_current_device_resource_ref();
+  *indptr_out =
+    static_cast<int*>(rmm_alloc.allocate_async((num_indices + 1) * sizeof(int), stream));
 
   *nnz = computeIndptrForSubset(indptr_in, *indptr_out, row_indices, num_indices, stream);
 
   // allocate indices, data
-  *indices_out = (int*)rmm_alloc->allocate(*nnz * sizeof(int), stream);
-  *data_out    = (math_t*)rmm_alloc->allocate(*nnz * sizeof(math_t), stream);
+  *indices_out = static_cast<int*>(rmm_alloc.allocate_async(*nnz * sizeof(int), stream));
+  *data_out    = static_cast<math_t*>(rmm_alloc.allocate_async(*nnz * sizeof(math_t), stream));
 
   // copy with 1 warp per row for now, blocksize 256
   const dim3 bs(32, 8, 1);

@@ -28,13 +28,13 @@
 #include <cuml/svm/svm_parameter.h>
 
 #include <raft/core/handle.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
 #include <raft/distance/kernels.cuh>
 #include <raft/label/classlabels.cuh>
 #include <raft/linalg/gemv.cuh>
 
 #include <rmm/aligned.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
@@ -71,9 +71,9 @@ void svcFitX(const raft::handle_t& handle,
   cudaStream_t stream = handle_impl.get_stream();
   {
     rmm::device_uvector<math_t> unique_labels(0, stream);
-    model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
-    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
-    model.unique_labels                      = (math_t*)rmm_alloc.allocate_async(
+    model.n_classes     = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
+    auto rmm_alloc      = raft::resource::get_current_device_resource_ref();
+    model.unique_labels = (math_t*)rmm_alloc.allocate_async(
       model.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     raft::copy(model.unique_labels, unique_labels.data(), model.n_classes, stream);
     handle_impl.sync_stream(stream);
@@ -356,7 +356,7 @@ template <typename math_t>
 void svmFreeBuffers(const raft::handle_t& handle, SvmModel<math_t>& m)
 {
   cudaStream_t stream                      = handle.get_stream();
-  rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+  rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource_ref();
   if (m.dual_coefs)
     rmm_alloc.deallocate_async(
       m.dual_coefs, m.n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);