Skip to content

Commit

Permalink
remove dyanmic device memory alloc free in reduce kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
ryichando committed Jan 13, 2025
1 parent fc5b58d commit 70a204b
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 81 deletions.
62 changes: 6 additions & 56 deletions src/cpp/main/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ void initialize(DataSet _host_dataset, DataSet _dev_dataset, ParamSet *_param) {
unsigned face_count = host_dataset.mesh.mesh.face.size;
unsigned hinge_count = host_dataset.mesh.mesh.hinge.size;
unsigned tet_count = host_dataset.mesh.mesh.tet.size;

const unsigned max_reduce_count = std::max(
std::max(face_count, edge_count), std::max(tet_count, 3 * vert_count));
utility::set_max_reduce_count(max_reduce_count);

unsigned collision_mesh_vert_count =
host_dataset.constraint.mesh.active
? host_dataset.constraint.mesh.vertex.size
Expand Down Expand Up @@ -238,43 +243,8 @@ StepResult advance() {
const unsigned shell_face_count = host_dataset.shell_face_count;
const unsigned tet_count = host_data.mesh.mesh.tet.size;
const float strain_limit_sum = prm.strain_limit_tau + prm.strain_limit_eps;
SimpleLog::set(prm.time);

// Name: Vertex Count
// Format: list[(vid_time,int)]
// Description:
// Total vertex count in the scene. The format is time-dependent
// but should not change during the simulation.
logging.mark("vertex count", vertex_count);

// Name: Rod Count
// Format: list[(vid_time,int)]
// Description:
// Total edge rod element count in the scene. The format is time-dependent
// but should not change during the simulation.
logging.mark("rod count", host_data.rod_count);

// Name: Shell Count
// Format: list[(vid_time,int)]
// Description:
// Total triangular shell element count in the scene. The format is
// time-dependent but should not change during the simulation.
logging.mark("shell count", host_data.shell_face_count);

// Name: Triangle Count
// Format: list[(vid_time,int)]
// Map: triangle_count
// Description:
// Total triangular shell element count in the scene. The format is
// time-dependent but should not change during the simulation.
logging.mark("face count", host_data.mesh.mesh.face.size);

// Name: Tet Count
// Format: list[(vid_time,int)]
// Description:
// Total tetrahedral element count in the scene. The format is
// time-dependent but should not change during the simulation.
logging.mark("tet count", host_data.mesh.mesh.tet.size);
SimpleLog::set(prm.time);

logging.push("build_kinematic");
build_kinematic(host_dataset, dev_dataset, *param);
Expand Down Expand Up @@ -307,16 +277,6 @@ StepResult advance() {
tmp_scalar[i] = data.prop.face[i].mass;
}
} DISPATCH_END;
total_shell_mass = utility::sum_array(tmp_scalar, shell_face_count);
}

if (total_shell_mass > 0.0f) {
// Name: Total Shell Mass
// Format: list[(vid_time,kg)]
// Description:
// Total mass of all the shell elements in the scene.
// Should not change during the simulation.
logging.mark("total shell mass", total_shell_mass);
}

float total_solid_mass = 0.0f;
Expand All @@ -328,16 +288,6 @@ StepResult advance() {
tmp_scalar[i] = data.prop.tet[i].mass;
}
} DISPATCH_END;
total_solid_mass = utility::sum_array(tmp_scalar, tet_count);
}

if (total_solid_mass > 0.0f) {
// Name: Total Solid Mass
// Format: list[(vid_time,kg)]
// Description:
// Total mass of all the tet elements in the scene.
// Should not change during the simulation.
logging.mark("total solid mass", total_solid_mass);
}

float dt = param->dt;
Expand Down
61 changes: 36 additions & 25 deletions src/cpp/utility/utility.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,22 @@

namespace utility {

struct ReduceInfo {
unsigned n = 0;
unsigned *d_block_sums = nullptr;
unsigned *h_results = nullptr;

void init(unsigned n) {
this->n = n;
unsigned num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
CUDA_HANDLE_ERROR(
cudaMalloc(&d_block_sums, num_blocks * sizeof(unsigned)));
h_results = new unsigned[num_blocks];
}
};

static ReduceInfo reduce_info;

__device__ Vec3f compute_vertex_normal(const DataSet &data,
const Vec<Vec3f> &vertex, unsigned i) {
Vec3f normal = Vec3f::Zero();
Expand Down Expand Up @@ -216,32 +232,25 @@ __global__ void reduce_op_kernel(const T *input, Y *output, Op func, Y init_val,

template <class T, class Y, typename Op>
Y reduce(const T *d_input, Op func, Y init_val, unsigned n) {
unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
const unsigned scale_factor = 2;
static Y *d_output = nullptr;
static Y *h_results = nullptr;
static unsigned max_grid_size = 0;
if (d_output == nullptr) {
max_grid_size = scale_factor * grid_size;
cudaMalloc(&d_output, max_grid_size * sizeof(Y));
h_results = new Y[max_grid_size];
} else if (grid_size > max_grid_size) {
max_grid_size = scale_factor * grid_size;
cudaFree(d_output);
delete[] h_results;
cudaMalloc(&d_output, max_grid_size * sizeof(Y));
h_results = new Y[max_grid_size];
}
size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
d_input, d_output, func, init_val, n);
cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
cudaMemcpyDeviceToHost);
Y result = init_val;
for (unsigned i = 0; i < grid_size; i++) {
result = func(result, h_results[i]);
if (sizeof(Y) * n <= sizeof(unsigned) * reduce_info.n) {
unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
Y *d_output = reinterpret_cast<Y *>(reduce_info.d_block_sums);
Y *h_results = reinterpret_cast<Y *>(reduce_info.h_results);
size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(
d_input, d_output, func, init_val, n);
cudaMemcpy(h_results, d_output, grid_size * sizeof(Y),
cudaMemcpyDeviceToHost);
Y result = init_val;
for (unsigned i = 0; i < grid_size; i++) {
result = func(result, h_results[i]);
}
return result;
} else {
fprintf(stderr, "Error: reduce buffer size is too small\n");
fprintf(stderr, "n: %u, reduce_info.n: %u\n", n, reduce_info.n);
exit(1);
}
return result;
}

template <class T> T sum_array(Vec<T> array, unsigned size) {
Expand Down Expand Up @@ -288,6 +297,8 @@ __device__ float get_wind_weight(float time) {
return t * (0.5f * (1.0f + sinf(angle))) + (1.0f - t);
}

void set_max_reduce_count(unsigned n) { reduce_info.init(n); }

} // namespace utility

template float utility::sum_array(Vec<float> array, unsigned size);
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/utility/utility.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ void compute_svd(DataSet data, Vec<Vec3f> curr, Vec<Svd3x2> svd,
ParamSet param);
__device__ float get_wind_weight(float time);

void set_max_reduce_count(unsigned n);

} // namespace utility

#endif

0 comments on commit 70a204b

Please sign in to comment.