Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use raft current_resource wrappers #6059

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions cpp/bench/common/ml_benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@
#include <cuml/common/logger.hpp>
#include <cuml/common/utils.hpp>

#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/util/cudart_utils.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime.h>

#include <benchmark/benchmark.h>
Expand Down Expand Up @@ -165,15 +164,15 @@ class Fixture : public ::benchmark::Fixture {
void alloc(T*& ptr, size_t len, bool init = false)
{
auto nBytes = len * sizeof(T);
auto d_alloc = rmm::mr::get_current_device_resource();
auto d_alloc = raft::resource::get_current_device_resource();
ptr = (T*)d_alloc->allocate(nBytes, stream);
if (init) { RAFT_CUDA_TRY(cudaMemsetAsync(ptr, 0, nBytes, stream)); }
}

template <typename T>
void dealloc(T* ptr, size_t len)
{
auto d_alloc = rmm::mr::get_current_device_resource();
auto d_alloc = raft::resource::get_current_device_resource();
d_alloc->deallocate(ptr, len * sizeof(T), stream);
}

Expand Down
66 changes: 36 additions & 30 deletions cpp/examples/symreg/symreg_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@

#include <raft/util/cudart_utils.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <algorithm>
#include <cmath>
Expand Down Expand Up @@ -200,11 +201,11 @@ int main(int argc, char* argv[])
/* ======================= Begin GPU memory allocation ======================= */
std::cout << "***************************************" << std::endl;

cudaStream_t stream;
rmm::cuda_stream stream;
raft::handle_t handle{stream};

// Begin recording time
cudaEventRecord(start, stream);
CUDA_RT_CALL(cudaEventRecord(start, stream.value()));

rmm::device_uvector<float> dX_train(n_cols * n_train_rows, stream);
rmm::device_uvector<float> dy_train(n_train_rows, stream);
Expand All @@ -215,46 +216,54 @@ int main(int argc, char* argv[])
rmm::device_uvector<float> dy_pred(n_test_rows, stream);
rmm::device_scalar<float> d_score{stream};

cg::program_t d_finalprogs; // pointer to last generation ASTs on device

CUDA_RT_CALL(cudaMemcpyAsync(dX_train.data(),
X_train.data(),
sizeof(float) * dX_train.size(),
cudaMemcpyHostToDevice,
stream));
stream.value()));

CUDA_RT_CALL(cudaMemcpyAsync(dy_train.data(),
y_train.data(),
sizeof(float) * dy_train.size(),
cudaMemcpyHostToDevice,
stream));
stream.value()));

CUDA_RT_CALL(cudaMemcpyAsync(dw_train.data(),
w_train.data(),
sizeof(float) * dw_train.size(),
cudaMemcpyHostToDevice,
stream));
stream.value()));

CUDA_RT_CALL(cudaMemcpyAsync(
dX_test.data(), X_test.data(), sizeof(float) * dX_test.size(), cudaMemcpyHostToDevice, stream));
CUDA_RT_CALL(cudaMemcpyAsync(dX_test.data(),
X_test.data(),
sizeof(float) * dX_test.size(),
cudaMemcpyHostToDevice,
stream.value()));

CUDA_RT_CALL(cudaMemcpyAsync(
dy_test.data(), y_test.data(), sizeof(float) * dy_test.size(), cudaMemcpyHostToDevice, stream));
CUDA_RT_CALL(cudaMemcpyAsync(dy_test.data(),
y_test.data(),
sizeof(float) * dy_test.size(),
cudaMemcpyHostToDevice,
stream.value()));

CUDA_RT_CALL(cudaMemcpyAsync(
dw_test.data(), w_test.data(), sizeof(float) * n_test_rows, cudaMemcpyHostToDevice, stream));
CUDA_RT_CALL(cudaMemcpyAsync(dw_test.data(),
w_test.data(),
sizeof(float) * n_test_rows,
cudaMemcpyHostToDevice,
stream.value()));

// Initialize AST
auto curr_mr = rmm::mr::get_current_device_resource();
d_finalprogs = static_cast<cg::program_t>(curr_mr->allocate(params.population_size, stream));
auto prog_buffer = rmm::device_buffer(params.population_size, stream);
// pointer to last generation ASTs on device
cg::program_t d_finalprogs = static_cast<cg::program_t>(prog_buffer.data());

std::vector<std::vector<cg::program>> history;
history.reserve(params.generations);

cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
CUDA_RT_CALL(cudaEventSynchronize(stop));
float alloc_time;
cudaEventElapsedTime(&alloc_time, start, stop);
CUDA_RT_CALL(cudaEventElapsedTime(&alloc_time, start, stop));

std::cout << "Allocated device memory in " << std::setw(10) << alloc_time << "ms" << std::endl;

Expand All @@ -263,7 +272,7 @@ int main(int argc, char* argv[])
std::cout << "***************************************" << std::endl;
std::cout << std::setw(30) << "Beginning training for " << std::setw(15) << params.generations
<< " generations" << std::endl;
cudaEventRecord(start, stream);
CUDA_RT_CALL(cudaEventRecord(start, stream.value()));

cg::symFit(handle,
dX_train.data(),
Expand All @@ -275,10 +284,10 @@ int main(int argc, char* argv[])
d_finalprogs,
history);

cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
CUDA_RT_CALL(cudaEventSynchronize(stop));
float training_time;
cudaEventElapsedTime(&training_time, start, stop);
CUDA_RT_CALL(cudaEventElapsedTime(&training_time, start, stop));

int n_gen = params.num_epochs;
std::cout << std::setw(30) << "Convergence achieved in " << std::setw(15) << n_gen
Expand Down Expand Up @@ -308,7 +317,7 @@ int main(int argc, char* argv[])

std::cout << "***************************************" << std::endl;
std::cout << "Beginning Inference on test dataset " << std::endl;
cudaEventRecord(start, stream);
CUDA_RT_CALL(cudaEventRecord(start, stream.value()));
cuml::genetic::symRegPredict(
handle, dX_test.data(), n_test_rows, d_finalprogs + best_idx, dy_pred.data());

Expand All @@ -319,10 +328,10 @@ int main(int argc, char* argv[])
cuml::genetic::compute_metric(
handle, n_test_rows, 1, dy_test.data(), dy_pred.data(), dw_test.data(), d_score.data(), params);

cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
CUDA_RT_CALL(cudaEventRecord(stop, stream.value()));
CUDA_RT_CALL(cudaEventSynchronize(stop));
float inference_time;
cudaEventElapsedTime(&inference_time, start, stop);
CUDA_RT_CALL(cudaEventElapsedTime(&inference_time, start, stop));

// Output fitness score
std::cout << "Inference score = " << d_score.value(stream) << std::endl;
Expand All @@ -336,9 +345,6 @@ int main(int argc, char* argv[])
std::copy(y_test.begin(), y_test.begin() + 5, std::ostream_iterator<float>(std::cout, ";"));
std::cout << std::endl;

/* ======================= Reset data ======================= */

curr_mr->deallocate(d_finalprogs, params.population_size, stream);
CUDA_RT_CALL(cudaEventDestroy(start));
CUDA_RT_CALL(cudaEventDestroy(stop));
return 0;
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/cuml/tsa/arima_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#pragma once

#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/util/cudart_utils.hpp>

#include <rmm/aligned.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime.h>
Expand Down Expand Up @@ -81,7 +81,7 @@ struct ARIMAParams {
*/
void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
{
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource();
if (order.k && !tr)
mu = (DataT*)rmm_alloc.allocate_async(
batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
Expand Down Expand Up @@ -115,7 +115,7 @@ struct ARIMAParams {
*/
void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
{
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource();
if (order.k && !tr)
rmm_alloc.deallocate_async(
mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/genetic/genetic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@
#include <cuml/genetic/genetic.h>
#include <cuml/genetic/program.h>

#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/random/rng.cuh>
#include <raft/util/cuda_utils.cuh>
#include <raft/util/cudart_utils.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <device_launch_parameters.h>

Expand Down Expand Up @@ -229,17 +229,17 @@ void parallel_evolve(const raft::handle_t& h,
program tmp(h_nextprogs[i]);
delete[] tmp.nodes;

auto mr = raft::resource::get_current_device_resource_ref();

// Set current generation device nodes
tmp.nodes = (node*)rmm::mr::get_current_device_resource()->allocate(
h_nextprogs[i].len * sizeof(node), stream);
tmp.nodes = static_cast<node*>(mr.allocate_async(h_nextprogs[i].len * sizeof(node), stream));
raft::copy(tmp.nodes, h_nextprogs[i].nodes, h_nextprogs[i].len, stream);
raft::copy(d_nextprogs + i, &tmp, 1, stream);

if (generation > 1) {
// Free device memory allocated to program nodes in previous generation
raft::copy(&tmp, d_oldprogs + i, 1, stream);
rmm::mr::get_current_device_resource()->deallocate(
tmp.nodes, h_nextprogs[i].len * sizeof(node), stream);
mr.deallocate_async(tmp.nodes, h_nextprogs[i].len * sizeof(node), stream);
}

tmp.nodes = nullptr;
Expand Down Expand Up @@ -408,8 +408,9 @@ void symFit(const raft::handle_t& handle,
std::vector<float> h_fitness(params.population_size, 0.0f);

program_t d_currprogs; // pointer to current programs
d_currprogs = (program_t)rmm::mr::get_current_device_resource()->allocate(
params.population_size * sizeof(program), stream);
auto mr = raft::resource::get_current_device_resource_ref();
d_currprogs =
static_cast<program_t>(mr.allocate_async(params.population_size * sizeof(program), stream));
program_t d_nextprogs = final_progs; // Reuse memory already allocated for final_progs
final_progs = nullptr;

Expand Down Expand Up @@ -490,8 +491,7 @@ void symFit(const raft::handle_t& handle,
if (growAuto) { params.terminalRatio = 0.0f; }

// Deallocate the previous generation device memory
rmm::mr::get_current_device_resource()->deallocate(
d_nextprogs, params.population_size * sizeof(program), stream);
mr.deallocate_async(d_nextprogs, params.population_size * sizeof(program), stream);
d_currprogs = nullptr;
d_nextprogs = nullptr;
}
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/svm/linear.cu
Original file line number Diff line number Diff line change
Expand Up @@ -329,13 +329,13 @@ LinearSVMModel<T> LinearSVMModel<T>::allocate(const raft::handle_t& handle,
const std::size_t nClasses)
{
auto stream = handle.get_stream();
auto res = rmm::mr::get_current_device_resource();
auto res = raft::resource::get_current_device_resource_ref();
const std::size_t coefRows = nCols + params.fit_intercept;
const std::size_t coefCols = nClasses <= 2 ? 1 : nClasses;
const std::size_t wSize = coefRows * coefCols;
const std::size_t cSize = nClasses >= 2 ? nClasses : 0;
const std::size_t pSize = params.probability ? 2 * coefCols : 0;
auto bytes = static_cast<T*>(res->allocate(sizeof(T) * (wSize + cSize + pSize), stream));
auto bytes = static_cast<T*>(res.allocate_async(sizeof(T) * (wSize + cSize + pSize), stream));
return LinearSVMModel<T>{/* .w */ bytes,
/* .classes */ cSize > 0 ? bytes + wSize : nullptr,
/* .probScale */ pSize > 0 ? bytes + wSize + cSize : nullptr,
Expand All @@ -347,13 +347,13 @@ template <typename T>
void LinearSVMModel<T>::free(const raft::handle_t& handle, LinearSVMModel<T>& model)
{
auto stream = handle.get_stream();
auto res = rmm::mr::get_current_device_resource();
auto res = raft::resource::get_current_device_resource_ref();
const std::size_t coefRows = model.coefRows;
const std::size_t coefCols = model.coefCols();
const std::size_t wSize = coefRows * coefCols;
const std::size_t cSize = model.nClasses;
const std::size_t pSize = model.probScale == nullptr ? 2 * coefCols : 0;
res->deallocate(model.w, sizeof(T) * (wSize + cSize + pSize), stream);
res.deallocate_async(model.w, sizeof(T) * (wSize + cSize + pSize), stream);
model.w = nullptr;
model.classes = nullptr;
model.probScale = nullptr;
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/svm/results.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
#include "ws_util.cuh"

#include <cuml/svm/svm_model.h>
#include <cuml/svm/svm_parameter.h>

#include <raft/core/handle.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/init.cuh>
#include <raft/linalg/map_then_reduce.cuh>
Expand All @@ -31,7 +33,6 @@

#include <rmm/aligned.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_select.cuh>
Expand Down Expand Up @@ -73,7 +74,7 @@ class Results {
const math_t* y,
const math_t* C,
SvmType svmType)
: rmm_alloc(rmm::mr::get_current_device_resource()),
: rmm_alloc(raft::resource::get_current_device_resource_ref()),
stream(handle.get_stream()),
handle(handle),
n_rows(n_rows),
Expand Down
9 changes: 5 additions & 4 deletions cpp/src/svm/sparse_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -762,14 +762,15 @@ void extractRows(raft::device_csr_matrix_view<math_t, int, int, int> matrix_in,
math_t* data_in = matrix_in.get_elements().data();

// allocate indptr
auto* rmm_alloc = rmm::mr::get_current_device_resource();
*indptr_out = (int*)rmm_alloc->allocate((num_indices + 1) * sizeof(int), stream);
auto rmm_alloc = raft::resource::get_current_device_resource_ref();
*indptr_out =
static_cast<int*>(rmm_alloc.allocate_async((num_indices + 1) * sizeof(int), stream));

*nnz = computeIndptrForSubset(indptr_in, *indptr_out, row_indices, num_indices, stream);

// allocate indices, data
*indices_out = (int*)rmm_alloc->allocate(*nnz * sizeof(int), stream);
*data_out = (math_t*)rmm_alloc->allocate(*nnz * sizeof(math_t), stream);
*indices_out = static_cast<int*>(rmm_alloc.allocate_async(*nnz * sizeof(int), stream));
*data_out = static_cast<math_t*>(rmm_alloc.allocate_async(*nnz * sizeof(math_t), stream));

// copy with 1 warp per row for now, blocksize 256
const dim3 bs(32, 8, 1);
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/svm/svc_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
#include <cuml/svm/svm_parameter.h>

#include <raft/core/handle.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/distance/kernels.cuh>
#include <raft/label/classlabels.cuh>
#include <raft/linalg/gemv.cuh>

#include <rmm/aligned.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/copy.h>
Expand Down Expand Up @@ -71,9 +71,9 @@ void svcFitX(const raft::handle_t& handle,
cudaStream_t stream = handle_impl.get_stream();
{
rmm::device_uvector<math_t> unique_labels(0, stream);
model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
model.unique_labels = (math_t*)rmm_alloc.allocate_async(
model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
auto rmm_alloc = raft::resource::get_current_device_resource_ref();
model.unique_labels = (math_t*)rmm_alloc.allocate_async(
model.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
raft::copy(model.unique_labels, unique_labels.data(), model.n_classes, stream);
handle_impl.sync_stream(stream);
Expand Down Expand Up @@ -356,7 +356,7 @@ template <typename math_t>
void svmFreeBuffers(const raft::handle_t& handle, SvmModel<math_t>& m)
{
cudaStream_t stream = handle.get_stream();
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
rmm::device_async_resource_ref rmm_alloc = raft::resource::get_current_device_resource_ref();
if (m.dual_coefs)
rmm_alloc.deallocate_async(
m.dual_coefs, m.n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
Expand Down
Loading
Loading