Allocate extra device memory for the reduce kernel (buffers sized at 2x the grid size)

st-tech · Jan 10, 2025 · 457d845
1 parent 757639e
commit 457d845
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 12 deletions.
diff --git a/examples/domino.ipynb b/examples/domino.ipynb
@@ -11,11 +11,11 @@
     "from frontend import App\n",
     "\n",
     "app = App.create(\"domino\")\n",
-    "V, F, T = app.mesh.box().subdivide(2).tetrahedralize().scale(0.1, 0.3, 0.025)\n",
+    "V, F, T = app.mesh.box().subdivide(1).tetrahedralize().scale(0.1, 0.3, 0.025)\n",
     "app.asset.add.tet(\"block\", V, F, T)\n",
     "scene = app.scene.create(\"domino\")\n",
     "\n",
-    "R, minR, d, N, C = 1.25, 0.6, 0.15, 4096, 3\n",
+    "R, minR, d, N, C = 1.0, 0.5, 0.15, 4096, 2\n",
     "xz = None\n",
     "for i in reversed(range(N)):\n",
     "    t = 2.0 * C * np.pi * i / N - np.pi / 2\n",
@@ -44,13 +44,13 @@
    "source": [
     "param = app.session.param()\n",
     "(\n",
-    "    param.set(\"volume-young-mod\", 6000)\n",
-    "    .set(\"volume-poiss-rat\", 0.49)\n",
+    "    param.set(\"volume-young-mod\", 1000)\n",
+    "    .set(\"volume-poiss-rat\", 0.499)\n",
     "    .set(\"friction\", 0.1)\n",
     "    .set(\"min-newton-steps\", 32)\n",
     "    .set(\"dt\", 0.01)\n",
-    "    .set(\"fps\", 15)\n",
-    "    .set(\"frames\", 300)\n",
+    "    .set(\"fps\", 30)\n",
+    "    .set(\"frames\", 200)\n",
     ")\n",
     "\n",
     "session = app.session.create(fixed)\n",

diff --git a/src/cpp/utility/utility.cu b/src/cpp/utility/utility.cu
@@ -219,19 +219,20 @@ __global__ void reduce_op_kernel(const T *input, Y *output, Op func, Y init_val,
 template <class T, class Y, typename Op>
 Y reduce(const T *d_input, Op func, Y init_val, unsigned n) {
     unsigned grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    const unsigned scale_factor = 2;
     static Y *d_output = nullptr;
     static Y *h_results = nullptr;
     static unsigned max_grid_size = 0;
     if (d_output == nullptr) {
-        cudaMalloc(&d_output, grid_size * sizeof(Y));
-        h_results = new Y[grid_size];
-        max_grid_size = grid_size;
+        max_grid_size = scale_factor * grid_size;
+        cudaMalloc(&d_output, max_grid_size * sizeof(Y));
+        h_results = new Y[max_grid_size];
     } else if (grid_size > max_grid_size) {
-        max_grid_size = grid_size;
+        max_grid_size = scale_factor * grid_size;
         cudaFree(d_output);
         delete[] h_results;
-        cudaMalloc(&d_output, grid_size * sizeof(Y));
-        h_results = new Y[grid_size];
+        cudaMalloc(&d_output, max_grid_size * sizeof(Y));
+        h_results = new Y[max_grid_size];
     }
     size_t shared_mem_size = sizeof(Y) * BLOCK_SIZE;
     reduce_op_kernel<T, Y><<<grid_size, BLOCK_SIZE, shared_mem_size>>>(