From 3d446f156743a1ef5352debfdb550f93d2cf9a29 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:29:47 +0200 Subject: [PATCH 01/19] fix constness issue --- .../cuda/experimental/__stf/internal/parallel_for_scope.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 51ad219827f..f5b0b6d19d4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -353,7 +353,7 @@ public: }(); const auto [block_size, min_blocks] = conf; - const size_t n = sub_shape.size(); + size_t n = sub_shape.size(); // If there is no item in that shape, no need to launch a kernel ! if (n == 0) @@ -428,7 +428,7 @@ public: auto arg1 = mv(explode_deps); auto arg2 = deps.instance(t); - void* kernelArgs[] = {(void*) &n, &arg1, &arg2}; + void* kernelArgs[] = {&n, &arg1, &arg2}; kernel_params.kernelParams = kernelArgs; kernel_params.extra = nullptr; From 2b30f49f1be58cbe223985930da7df771d2db3d2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:30:12 +0200 Subject: [PATCH 02/19] use size_t instead of integers --- cudax/test/stf/examples/05-stencil-no-copy.cu | 20 +++++++++---------- cudax/test/stf/freeze/freeze.cu | 2 +- cudax/test/stf/graph/graph_composition.cu | 4 +--- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cudax/test/stf/examples/05-stencil-no-copy.cu b/cudax/test/stf/examples/05-stencil-no-copy.cu index 6202e7fdab3..4d6e4f850dc 100644 --- a/cudax/test/stf/examples/05-stencil-no-copy.cu +++ b/cudax/test/stf/examples/05-stencil-no-copy.cu @@ -52,7 +52,7 @@ T check_sum(stream_ctx& ctx, data_block& bn) auto t = ctx.task(exec_place::host, bn.handle.read()); t->*[&](cudaStream_t stream, auto h_center) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int offset = bn.ghost_size; offset < bn.ghost_size + bn.block_size; offset++) + for (size_t offset = bn.ghost_size; offset < bn.ghost_size + bn.block_size; offset++) { sum += h_center.data_handle()[offset]; } @@ -147,8 +147,8 @@ int main(int argc, char** argv) stream_ctx ctx; int NITER = 500; - int NBLOCKS = 4 * ndevs; - int BLOCK_SIZE = 2048 * 1024; + size_t NBLOCKS = 4 * ndevs; + size_t BLOCK_SIZE = 2048 * 1024; if (argc > 1) { @@ -174,16 +174,16 @@ int main(int argc, char** argv) std::vector> Un1; // Create blocks and allocates host data - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { - int beg = b * BLOCK_SIZE; - int end = (b + 1) * BLOCK_SIZE; + size_t beg = b * BLOCK_SIZE; + size_t end = (b + 1) * BLOCK_SIZE; Un.emplace_back(ctx, beg, end, 1); Un1.emplace_back(ctx, beg, end, 1); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { Un[b].dev_id = b % ndevs; Un1[b].dev_id = b % ndevs; @@ -192,14 +192,14 @@ int main(int argc, char** argv) // Fill blocks with initial values. For the sake of simplicity, we are // using a synchronization primitive and host code, but this could have // been written asynchronously using host callbacks. 
- for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { - int beg = b * BLOCK_SIZE; + size_t beg = b * BLOCK_SIZE; auto t = ctx.task(exec_place::host, Un[b].handle.rw(), Un1[b].handle.rw()); t->*[&](cudaStream_t stream, auto Un_vals, auto Un1_vals) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) { double val = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; Un1_vals.data_handle()[local_idx + GHOST_SIZE] = val; diff --git a/cudax/test/stf/freeze/freeze.cu b/cudax/test/stf/freeze/freeze.cu index 691552b9aa6..12ca1119e01 100644 --- a/cudax/test/stf/freeze/freeze.cu +++ b/cudax/test/stf/freeze/freeze.cu @@ -75,7 +75,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == 2 * X0(i) + 1); } diff --git a/cudax/test/stf/graph/graph_composition.cu b/cudax/test/stf/graph/graph_composition.cu index f8da5502e55..45d298d8bc1 100644 --- a/cudax/test/stf/graph/graph_composition.cu +++ b/cudax/test/stf/graph/graph_composition.cu @@ -90,12 +90,10 @@ int main() size_t N = 256 * 1024; size_t K = 8; - size_t BATCH_SIZE = 4; - logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lY[i] = ctx.logical_data(N); From c8d6eec1627ae60e54e55a9b97a7a5dec91df210 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:40:02 +0200 Subject: [PATCH 03/19] fix constness issue --- cudax/test/stf/cpp/redundant_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/test/stf/cpp/redundant_data.cu b/cudax/test/stf/cpp/redundant_data.cu index 95c4caf156b..d494c06d39a 100644 --- a/cudax/test/stf/cpp/redundant_data.cu +++ b/cudax/test/stf/cpp/redundant_data.cu @@ -75,7 +75,7 @@ int main() { double X[n], Y[n]; - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { X[ind] = 1.0 * ind; Y[ind] = 2.0 * ind - 3.0; From dfd293b2af163bb7a7ffcc6c638c960dac178673 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 00:28:53 +0200 Subject: [PATCH 04/19] Fix minor warnings --- .devcontainer/cccl-entrypoint.sh | 5 +++- cudax/cmake/cudaxHeaderTesting.cmake | 6 ++--- .../__stf/internal/parallel_for_scope.cuh | 1 - .../experimental/__stf/internal/slice.cuh | 4 +-- .../cuda/experimental/__stf/places/places.cuh | 10 +++---- .../experimental/__stf/utility/traits.cuh | 1 + cudax/test/stf/examples/05-stencil-no-copy.cu | 10 +++---- cudax/test/stf/examples/05-stencil.cu | 26 +++++++++---------- .../test/stf/examples/05-stencil2d-places.cu | 4 +-- .../test/stf/freeze/constant_logical_data.cu | 2 +- cudax/test/stf/freeze/task_fence.cu | 2 +- cudax/test/stf/graph/for_each_batched.cu | 6 ++--- .../test/stf/graph/for_each_batched_write.cu | 12 ++++----- cudax/test/stf/graph/freeze_for_graph.cu | 2 +- cudax/test/stf/graph/graph_composition.cu | 4 +-- cudax/test/stf/green_context/gc_grid.cu | 2 +- 16 files changed, 50 insertions(+), 47 deletions(-) diff --git a/.devcontainer/cccl-entrypoint.sh b/.devcontainer/cccl-entrypoint.sh index 79cd453c771..de42a5d5666 100755 --- a/.devcontainer/cccl-entrypoint.sh +++ b/.devcontainer/cccl-entrypoint.sh @@ -8,7 +8,10 @@ devcontainer-utils-post-create-command; devcontainer-utils-init-git; devcontainer-utils-post-attach-command; -cd /home/coder/cccl/ +# cd /home/coder/cccl/ +echo "CLANG .." 
+clang-format -i cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +clang-format --version if test $# -gt 0; then exec "$@"; diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index c4cd43c5d21..d9a584b5d97 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -57,9 +57,9 @@ function(cudax_add_header_test label definitions) # FIXME: layout_left::mapping referenced before layout_left: cuda/experimental/__stf/supplemental_std_experimental/__p0009_bits/layout_left.hpp - # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114) - cuda/experimental/__stf/stream/stream_task.cuh - cuda/experimental/__stf/stream/stream_ctx.cuh + # # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114) + # cuda/experimental/__stf/stream/stream_task.cuh + # cuda/experimental/__stf/stream/stream_ctx.cuh ) target_link_libraries(${headertest_target} PUBLIC ${cn_target}) target_compile_definitions(${headertest_target} PRIVATE diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index f5b0b6d19d4..2344e809d7c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -287,7 +287,6 @@ public: } else { - auto grid_dims = t.grid_dims(); size_t grid_size = t.grid_dims().size(); for (size_t i = 0; i < grid_size; i++) { diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh index 0e3157f4d17..5cfa19bc1f5 100644 --- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh @@ -1065,7 +1065,7 @@ size_t data_hash([[maybe_unused]] mdspan s, ::std::index_sequence -void data_dump(mdspan s, +void data_dump([[maybe_unused]] mdspan s, ::std::ostream& file = ::std::cerr, ::std::index_sequence = ::std::index_sequence<>()) { @@ -1181,7 +1181,7 @@ struct std::hash<::cuda::experimental::stf::mdspan> if constexpr (_dimensions > 1) { - for (auto i = 1; i < _dimensions; i++) + for (size_t i = 1; i < _dimensions; i++) { cuda::experimental::stf::hash_combine(h, s.stride(i)); } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 9441ddf9f12..5f0914a9922 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -1537,7 +1537,7 @@ interpreted_execution_policy::interpreted_execution_policy( } else { - if (l1_size > block_size_limit) + if (int(l1_size) > block_size_limit) { fprintf(stderr, "Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 1)\n", @@ -1553,7 +1553,7 @@ interpreted_execution_policy::interpreted_execution_policy( } // Enforce the resource limits in the number of threads per block - assert(l1_size <= block_size_limit); + assert(int(l1_size) <= block_size_limit); assert(l0_size % ndevs == 0); @@ -1589,7 +1589,7 @@ interpreted_execution_policy::interpreted_execution_policy( } else { - if (l2_size > block_size_limit) + if (int(l2_size) > block_size_limit) { fprintf(stderr, "Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 2)\n", @@ -1610,8 +1610,8 @@ interpreted_execution_policy::interpreted_execution_policy( } // Enforce the resource limits in the number of threads per 
block - assert(l2_size <= block_size_limit); - assert(l0_size <= ndevs); + assert(int(l2_size) <= block_size_limit); + assert(int(l0_size) <= ndevs); /* Merge blocks and devices */ this->add_level({::std::make_pair(hw_scope::device, l0_size)}); diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh index 2d987a10000..c443b5fea7c 100644 --- a/cudax/include/cuda/experimental/__stf/utility/traits.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh @@ -138,6 +138,7 @@ class print_type_name_and_fail template class meyers_singleton { +protected: template struct wrapper { diff --git a/cudax/test/stf/examples/05-stencil-no-copy.cu b/cudax/test/stf/examples/05-stencil-no-copy.cu index 4d6e4f850dc..b826fa601f1 100644 --- a/cudax/test/stf/examples/05-stencil-no-copy.cu +++ b/cudax/test/stf/examples/05-stencil-no-copy.cu @@ -210,13 +210,13 @@ int main(int argc, char** argv) for (int iter = 0; iter < NITER; iter++) { - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { update_halo(ctx, Un1[b], Un[(b - 1 + NBLOCKS) % NBLOCKS], Un[(b + 1) % NBLOCKS]); } // UPDATE Un from Un1 - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { stencil(ctx, Un[b], Un1[b]); } @@ -225,7 +225,7 @@ int main(int argc, char** argv) if (iter % 250 == 0) { double sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { sum += check_sum(ctx, Un[b]); } @@ -233,7 +233,7 @@ int main(int argc, char** argv) // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, sum); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Copy inner part of Un into Un1 copy_inner(ctx, Un[b], Un1[b]); @@ -242,7 +242,7 @@ int main(int argc, char** argv) // In this stencil, the sum of the elements is supposed to be a constant double sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { sum += check_sum(ctx, Un[b]); } diff --git a/cudax/test/stf/examples/05-stencil.cu b/cudax/test/stf/examples/05-stencil.cu index 83ad14e2124..04f343a2f36 100644 --- a/cudax/test/stf/examples/05-stencil.cu +++ b/cudax/test/stf/examples/05-stencil.cu @@ -42,7 +42,7 @@ public: ctx.task(exec_place::host, handle.read())->*[&](cudaStream_t stream, auto sn) { cuda_safe_call(cudaStreamSynchronize(stream)); const T* h_center = sn.data_handle(); - for (int offset = ghost_size; offset < ghost_size + block_size; offset++) + for (size_t offset = ghost_size; offset < ghost_size + block_size; offset++) { sum += h_center[offset]; } @@ -145,8 +145,8 @@ void copy_array(data_block& bn, data_block& bn1) int main(int argc, char** argv) { int NITER = 500; - int NBLOCKS = 4; - int BLOCK_SIZE = 2048 * 1024; + size_t NBLOCKS = 4; + size_t BLOCK_SIZE = 2048 * 1024; if (argc > 1) { @@ -177,7 +177,7 @@ int main(int argc, char** argv) std::vector> Un1; // Create blocks and allocates host data - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { int beg = b * BLOCK_SIZE; int end = (b + 1) * BLOCK_SIZE; @@ -186,7 +186,7 @@ int main(int argc, char** argv) Un1.emplace_back(beg, end, 1); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { Un[b].preferred_device = b % ndevs; Un1[b].preferred_device = b % ndevs; @@ -195,7 +195,7 @@ int main(int argc, char** argv) // Fill blocks with initial values. 
For the sake of simplicity, we are // using a synchronization primitive and host code, but this could have // been written asynchronously using host callbacks. - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { int beg = b * BLOCK_SIZE; @@ -205,7 +205,7 @@ int main(int argc, char** argv) double* Un_vals = sUn.data_handle(); double* Un1_vals = sUn1.data_handle(); - for (int local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) { Un1_vals[local_idx + GHOST_SIZE] = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; } @@ -214,25 +214,25 @@ int main(int argc, char** argv) for (int iter = 0; iter < NITER; iter++) { - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Update the internal copies of the left and right boundaries update_inner_interfaces(Un1[b]); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Apply ghost cells from neighbours to put then in the "center" array update_outer_interfaces(Un1[b], Un1[(b - 1 + NBLOCKS) % NBLOCKS], Un1[(b + 1) % NBLOCKS]); } // UPDATE Un from Un1 - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { stencil(Un[b], Un1[b]); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Save Un into Un1 copy_array(Un[b], Un1[b]); @@ -242,7 +242,7 @@ int main(int argc, char** argv) if (iter % 250 == 0) { double check_sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { check_sum += Un[b].check_sum(); } @@ -253,7 +253,7 @@ int main(int argc, char** argv) // In this stencil, the sum of the elements is supposed to be a constant double check_sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { check_sum += Un[b].check_sum(); } diff --git a/cudax/test/stf/examples/05-stencil2d-places.cu b/cudax/test/stf/examples/05-stencil2d-places.cu index 828dbacac9c..2fde20dd7b9 100644 --- a/cudax/test/stf/examples/05-stencil2d-places.cu +++ b/cudax/test/stf/examples/05-stencil2d-places.cu @@ -74,7 +74,7 @@ int main(int argc, char** argv) // TODO implement the proper tiled_partitioning along y ! 
data_place cdp = data_place::composite(tiled_partition<128>(), all_devs); - for (int iter = 0; iter < NITER; iter++) + for (size_t iter = 0; iter < NITER; iter++) { // UPDATE Un from Un1 ctx.task(lUn.rw(cdp), lUn1.read(cdp))->*[&](auto stream, auto sUn, auto sUn1) { @@ -99,7 +99,7 @@ int main(int argc, char** argv) if (vtk_dump) { char str[32]; - snprintf(str, 32, "Un_%05d.vtk", iter); + snprintf(str, 32, "Un_%05zu.vtk", iter); mdspan_to_vtk(sUn, std::string(str)); } }; diff --git a/cudax/test/stf/freeze/constant_logical_data.cu b/cudax/test/stf/freeze/constant_logical_data.cu index 716607b86fb..0f410a1322b 100644 --- a/cudax/test/stf/freeze/constant_logical_data.cu +++ b/cudax/test/stf/freeze/constant_logical_data.cu @@ -65,7 +65,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == (5 * i - 3) + 4 * (18 * i - 9) + 4 * (4 * i - 2)); } diff --git a/cudax/test/stf/freeze/task_fence.cu b/cudax/test/stf/freeze/task_fence.cu index 04898c7d7c4..9aa5716aa56 100644 --- a/cudax/test/stf/freeze/task_fence.cu +++ b/cudax/test/stf/freeze/task_fence.cu @@ -93,7 +93,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == 2 * X0(i) + 1); } diff --git a/cudax/test/stf/graph/for_each_batched.cu b/cudax/test/stf/graph/for_each_batched.cu index d8714a6b0e6..a640fcc9448 100644 --- a/cudax/test/stf/graph/for_each_batched.cu +++ b/cudax/test/stf/graph/for_each_batched.cu @@ -88,15 +88,15 @@ int main() stream_ctx ctx; - size_t N = 256 * 1024; - size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; size_t BATCH_SIZE = 4; logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lY[i] = ctx.logical_data(N); diff --git a/cudax/test/stf/graph/for_each_batched_write.cu b/cudax/test/stf/graph/for_each_batched_write.cu index 7ab09735988..2d044d5f678 100644 --- a/cudax/test/stf/graph/for_each_batched_write.cu +++ b/cudax/test/stf/graph/for_each_batched_write.cu @@ -35,15 +35,15 @@ int main() { stream_ctx ctx; - size_t N = 256 * 1024; - size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; - size_t BATCH_SIZE = 4; + const size_t BATCH_SIZE = 4; logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lX[i].set_symbol("x" + std::to_string(i)); @@ -52,7 +52,7 @@ int main() }; } - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { // NOT INITIALIZED lY[i] = ctx.logical_data(N); @@ -70,7 +70,7 @@ int main() COPY(inner_ctx, lxi, lyi); }; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { // TODO check actual content ctx.task(lY[i].read()).set_symbol("CHECK")->*[](cudaStream_t, auto) {}; diff --git a/cudax/test/stf/graph/freeze_for_graph.cu b/cudax/test/stf/graph/freeze_for_graph.cu index 068f23b7862..f445a89fda1 100644 --- a/cudax/test/stf/graph/freeze_for_graph.cu +++ b/cudax/test/stf/graph/freeze_for_graph.cu @@ -64,7 +64,7 @@ int main() fX.unfreeze(stream); ctx.host_launch(lX.read())->*[](auto x) { - for (int i = 0; i < x.size(); i++) + for (size_t i = 0; i < x.size(); i++) { EXPECT(x(i) == X0(i) + 2); } diff --git a/cudax/test/stf/graph/graph_composition.cu b/cudax/test/stf/graph/graph_composition.cu index 45d298d8bc1..91985720217 100644 --- a/cudax/test/stf/graph/graph_composition.cu +++ b/cudax/test/stf/graph/graph_composition.cu @@ -87,8 +87,8 @@ int main() stream_ctx ctx; - size_t N = 256 * 1024; - 
size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; logical_data> lX[K]; logical_data> lY[K]; diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu index 8e81f1ca6b3..06322b5343f 100644 --- a/cudax/test/stf/green_context/gc_grid.cu +++ b/cudax/test/stf/green_context/gc_grid.cu @@ -89,7 +89,7 @@ int main() auto& g_ctx = gc[devid]; auto cnt = g_ctx.get_count(); - for (int i = 0; i < cnt; i++) + for (size_t i = 0; i < cnt; i++) { places.push_back(exec_place::green_ctx(g_ctx.get_view(i))); } From 15f219752b43ee380f564e6c65b9b09274b108b7 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 01:02:46 +0200 Subject: [PATCH 05/19] Fix sizeness, constness and VLA issues --- cudax/examples/stf/1f1b.cu | 4 ++-- cudax/examples/stf/binary_fhe.cu | 2 +- cudax/examples/stf/graph_algorithms/pagerank.cu | 4 ++-- cudax/examples/stf/mandelbrot.cu | 4 ++-- cudax/test/stf/interface/data_from_device.cu | 2 +- cudax/test/stf/interface/data_from_device_2.cu | 2 +- cudax/test/stf/interface/data_from_device_wb.cu | 2 +- cudax/test/stf/interface/graph_use_device_data.cu | 4 ++-- cudax/test/stf/reclaiming/graph.cu | 2 +- cudax/test/stf/reclaiming/graph_2.cu | 2 +- cudax/test/stf/reclaiming/stream.cu | 2 +- cudax/test/stf/stencil/stencil-1D.cu | 14 +++++++------- cudax/test/stf/stress/empty_tasks_alloc.cu | 2 +- cudax/test/stf/stress/task_bench.cu | 2 +- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cudax/examples/stf/1f1b.cu b/cudax/examples/stf/1f1b.cu index 2add7930d79..7f3ef24ef89 100644 --- a/cudax/examples/stf/1f1b.cu +++ b/cudax/examples/stf/1f1b.cu @@ -81,7 +81,7 @@ int main(int argc, char** argv) { for (size_t iter = 0; iter < niter; iter++) { size_t task_cnt = 0; for (size_t b = 0; b < num_batches; b++) { - for (size_t d = 0; d < num_devs; d++) { + for (int d = 0; d < num_devs; d++) { ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { int ms = 10; long long int clock_cnt = (long long int) (ms * clock_rate / factor); @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // } // // for (size_t b = 0; b < num_batches; b++) { - for (size_t d = num_devs; d-- > 0;) { + for (int d = num_devs; d-- > 0;) { ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { int ms = 20; long long int clock_cnt = (long long int) (ms * clock_rate / factor); diff --git a/cudax/examples/stf/binary_fhe.cu b/cudax/examples/stf/binary_fhe.cu index 2898d15119a..1e0f5c226c1 100644 --- a/cudax/examples/stf/binary_fhe.cu +++ b/cudax/examples/stf/binary_fhe.cu @@ -24,7 +24,7 @@ class plaintext { public: plaintext(const context& ctx) : ctx(ctx) {} - plaintext(context& ctx, std::vector v) : ctx(ctx), values(v) { + plaintext(context& ctx, std::vector v) : values(v), ctx(ctx) { l = ctx.logical_data(&values[0], values.size()); } diff --git a/cudax/examples/stf/graph_algorithms/pagerank.cu b/cudax/examples/stf/graph_algorithms/pagerank.cu index 521851acfd8..fa66658d84f 100644 --- a/cudax/examples/stf/graph_algorithms/pagerank.cu +++ b/cudax/examples/stf/graph_algorithms/pagerank.cu @@ -124,8 +124,8 @@ int main() { printf("Page rank answer is %s.\n", abs(sum_pageranks - 1.0) < 0.001 ? 
"correct" : "not correct"); printf("PageRank Results:\n"); - for (int i = 0; i < page_rank.size(); ++i) { - printf("Vertex %d: %f\n", i, page_rank[i]); + for (size_t i = 0; i < page_rank.size(); ++i) { + printf("Vertex %ld: %f\n", i, page_rank[i]); } return 0; diff --git a/cudax/examples/stf/mandelbrot.cu b/cudax/examples/stf/mandelbrot.cu index d0cbd2665e0..8d0fb64920d 100644 --- a/cudax/examples/stf/mandelbrot.cu +++ b/cudax/examples/stf/mandelbrot.cu @@ -95,8 +95,8 @@ int main(int argc, char** argv) { imageFile << width << " " << height << "\n"; imageFile << "255\n"; - for (int y = 0; y < height; y++) - for (int x = 0; x < width; x++) { + for (size_t y = 0; y < height; y++) + for (size_t x = 0; x < width; x++) { int iterations = buffer(x, y); // Convert iterations to RGB values unsigned char r = (iterations % 8) * 32; diff --git a/cudax/test/stf/interface/data_from_device.cu b/cudax/test/stf/interface/data_from_device.cu index 1eb8b0bb20f..69ccbc74666 100644 --- a/cudax/test/stf/interface/data_from_device.cu +++ b/cudax/test/stf/interface/data_from_device.cu @@ -63,7 +63,7 @@ void run() // Access Ask to use X, Y and Z on the host ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto X, auto Y) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/interface/data_from_device_2.cu b/cudax/test/stf/interface/data_from_device_2.cu index 71cacbda7e0..74d5284e988 100644 --- a/cudax/test/stf/interface/data_from_device_2.cu +++ b/cudax/test/stf/interface/data_from_device_2.cu @@ -76,7 +76,7 @@ void run() // Access Ask to use X, Y and Z on the host ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto X, auto Y, auto Z) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/interface/data_from_device_wb.cu b/cudax/test/stf/interface/data_from_device_wb.cu index fae695109b0..be7ff580582 100644 --- a/cudax/test/stf/interface/data_from_device_wb.cu +++ b/cudax/test/stf/interface/data_from_device_wb.cu @@ -59,7 +59,7 @@ void run() }; ctx.host_launch(handle_X.rw())->*[&](auto X) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { X(ind) = 2.0 * X(ind) + 1.0; } diff --git a/cudax/test/stf/interface/graph_use_device_data.cu b/cudax/test/stf/interface/graph_use_device_data.cu index 8117e0ce750..25af47d0af3 100644 --- a/cudax/test/stf/interface/graph_use_device_data.cu +++ b/cudax/test/stf/interface/graph_use_device_data.cu @@ -42,7 +42,7 @@ __global__ void setup_vectors(int n, T* x, T* y, T* z) int main(int argc, char** argv) { graph_ctx ctx; - size_t n = 12; + const size_t n = 12; const double alpha = 2.0; double *dX, *dY, *dZ; @@ -77,7 +77,7 @@ int main(int argc, char** argv) }; ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto hX, auto hY, auto hZ) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(hX(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/reclaiming/graph.cu b/cudax/test/stf/reclaiming/graph.cu index 553b6434a6e..8d4973fe092 100644 --- a/cudax/test/stf/reclaiming/graph.cu +++ b/cudax/test/stf/reclaiming/graph.cu @@ -37,7 +37,7 @@ int main(int argc, char** argv) graph_ctx ctx; - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); char* h_buffer = new char[nblocks * block_size]; diff --git 
a/cudax/test/stf/reclaiming/graph_2.cu b/cudax/test/stf/reclaiming/graph_2.cu index 5778172f30f..9099a7be571 100644 --- a/cudax/test/stf/reclaiming/graph_2.cu +++ b/cudax/test/stf/reclaiming/graph_2.cu @@ -36,7 +36,7 @@ int main(int argc, char** argv) setenv("MAX_ALLOC_CNT", "2", 1); graph_ctx ctx; - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); std::vector h_buffer(nblocks * block_size); for (int i = 0; i < nblocks; i++) diff --git a/cudax/test/stf/reclaiming/stream.cu b/cudax/test/stf/reclaiming/stream.cu index a2d6dac67c6..b05c6f73722 100644 --- a/cudax/test/stf/reclaiming/stream.cu +++ b/cudax/test/stf/reclaiming/stream.cu @@ -113,7 +113,7 @@ int main(int argc, char** argv) auto dummy_alloc = block_allocator(ctx); ctx.set_allocator(dummy_alloc); - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); char* h_buffer = new char[nblocks * block_size]; diff --git a/cudax/test/stf/stencil/stencil-1D.cu b/cudax/test/stf/stencil/stencil-1D.cu index a7f975f4cae..843a1fb13d9 100644 --- a/cudax/test/stf/stencil/stencil-1D.cu +++ b/cudax/test/stf/stencil/stencil-1D.cu @@ -28,10 +28,10 @@ public: : ghost_size(ghost_size) , block_size(end - beg + 1) , array(new T[block_size + 2 * ghost_size]) - , handle(ctx.logical_data(array.get(), block_size + 2 * ghost_size)) , left_interface(new T[ghost_size]) - , left_handle(ctx.logical_data(left_interface.get(), ghost_size)) , right_interface(new T[ghost_size]) + , handle(ctx.logical_data(array.get(), block_size + 2 * ghost_size)) + , left_handle(ctx.logical_data(left_interface.get(), ghost_size)) , right_handle(ctx.logical_data(right_interface.get(), ghost_size)) {} @@ -46,8 +46,8 @@ public: } public: - size_t block_size; size_t ghost_size; + size_t block_size; std::unique_ptr array; std::unique_ptr left_interface; std::unique_ptr right_interface; @@ -55,11 +55,11 @@ public: // HANDLE = whole data + boundaries logical_data> handle; - // A piece of data to store the right part of the block - logical_data> right_handle; - // A piece of data to store the left part of the block logical_data> left_handle; + + // A piece of data to store the right part of the block + logical_data> right_handle; }; template @@ -173,7 +173,7 @@ int main(int argc, char** argv) // Create the graph - it starts out empty int NITER = 400; - for (size_t iter = 0; iter < NITER; iter++) + for (int iter = 0; iter < NITER; iter++) { // UPDATE Un from Un1 for (size_t b = 0; b < NBLOCKS; b++) diff --git a/cudax/test/stf/stress/empty_tasks_alloc.cu b/cudax/test/stf/stress/empty_tasks_alloc.cu index 2a43ec9b098..7bfeae653e1 100644 --- a/cudax/test/stf/stress/empty_tasks_alloc.cu +++ b/cudax/test/stf/stress/empty_tasks_alloc.cu @@ -30,7 +30,7 @@ int main(int argc, char** argv) std::chrono::steady_clock::time_point start, stop; start = std::chrono::steady_clock::now(); - for (size_t iter = 0; iter < iter_cnt; iter++) + for (int iter = 0; iter < iter_cnt; iter++) { auto lX = ctx.logical_data(shape_of>(N)); auto lY = ctx.logical_data(shape_of>(N)); diff --git a/cudax/test/stf/stress/task_bench.cu b/cudax/test/stf/stress/task_bench.cu index bc43357ac4c..f1cc20d699a 100644 --- a/cudax/test/stf/stress/task_bench.cu +++ b/cudax/test/stf/stress/task_bench.cu @@ -174,7 +174,7 @@ void bench(context& ctx, test_id id, size_t width, size_t nsteps, size_t repeat_ { std::chrono::steady_clock::time_point start, stop; - const int b = nsteps; + const size_t b = nsteps; std::vector>> data(width * b); const size_t data_size = 128; From be00476ce5dc979ddf8932d2513f391273a681ac Mon 
Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:41:26 +0200 Subject: [PATCH 06/19] Solve an unused variable issue due to constexpr --- .../__stf/internal/backend_ctx.cuh | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 05701dd78f1..6ab684fb96a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -281,17 +281,6 @@ public: dot.template add_vertex(t); } - auto insert_one_kernel = [](cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) { - cudaKernelNodeParams kconfig; - kconfig.blockDim = k.blockDim; - kconfig.extra = nullptr; - kconfig.func = (void*) k.func; - kconfig.gridDim = k.gridDim; - kconfig.kernelParams = k.args_ptr.data(); - kconfig.sharedMemBytes = k.sharedMem; - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); - }; - // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { @@ -362,6 +351,18 @@ public: } private: + /* Add a kernel to a CUDA graph given its description */ + auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const { + cudaKernelNodeParams kconfig; + kconfig.blockDim = k.blockDim; + kconfig.extra = nullptr; + kconfig.func = const_cast(k.func); + kconfig.gridDim = k.gridDim; + kconfig.kernelParams = k.args_ptr.data(); + kconfig.sharedMemBytes = k.sharedMem; + cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + } + ::std::string symbol; Ctx& ctx; task_dep_vector deps; From f07a1a1d2154c0ac15a6d9817f0697f014fb106a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:41:50 +0200 Subject: [PATCH 07/19] fix constness issue in logical_data arguments --- cudax/examples/stf/word_count.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/examples/stf/word_count.cu b/cudax/examples/stf/word_count.cu index 32be1d9e1fd..89b827c2ef9 100644 --- a/cudax/examples/stf/word_count.cu +++ b/cudax/examples/stf/word_count.cu @@ -40,7 +40,7 @@ int main() { context ctx; - auto ltext = ctx.logical_data((char*) &raw_input[0], { sizeof(raw_input) }); + auto ltext = ctx.logical_data(const_cast(&raw_input[0]), { sizeof(raw_input) }); int cnt = 0; auto lcnt = ctx.logical_data(&cnt, { 1 }); From 78cabc1eb265573b9ebc86824b4b35fcf6aa6c1e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:45:47 +0200 Subject: [PATCH 08/19] Solve a parsing error in g++ --- cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 6ab684fb96a..ccc33258347 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -1069,7 +1069,7 @@ private: template auto make_data_interface(P&&... p) { - return ::std::make_shared>(::std::forward
(p)...);
+    return ::std::make_shared>(::std::forward
(p)...); } }; From 87f8a91dd3340fb1cb16b94a51c824fab7f0636c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 10:47:18 +0200 Subject: [PATCH 09/19] Fix the equality operator of the exec_place_grid to avoid ambiguities in C++20 --- .../cuda/experimental/__stf/places/places.cuh | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 5f0914a9922..d2e146f101d 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -804,12 +804,28 @@ public: bool operator==(const exec_place::impl& rhs) const override { - if (!exec_place::impl::operator==(rhs)) + // First, check if rhs is of type exec_place_grid::impl + auto other = dynamic_cast(&rhs); + if (!other) { - return false; + return false; // rhs is not a grid, so they are not equal } - auto other = dynamic_cast(&rhs); - return other && dims == other->dims && places == other->places; + + // Compare two grids + return *this == *other; + } + + // Compare two grids + bool operator==(const impl& rhs) const + { + // First, compare base class properties + if (!exec_place::impl::operator==(rhs)) + { + return false; + } + + // Compare grid-specific properties + return dims == rhs.dims && places == rhs.places; } const ::std::vector& get_places() const From 86af6d491b39f044971b769228ae225ea47fd24d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:06:14 +0200 Subject: [PATCH 10/19] avoid a dangling reference warning --- .../include/cuda/experimental/__stf/stream/stream_task.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index b3fcb4b72c1..28fa0ec6028 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -111,7 +111,11 @@ public: // We have currently no way to pass an array of per-place streams assert(automatic_stream); - const auto& places = e_place.as_grid().get_places(); + // Note: we store grid in a variable to avoid dangling references + // because the compiler does not know we are making a refernce to + // a vector that remains valid + const auto& grid = e_place.as_grid(); + const auto& places = grid.get_places(); for (const exec_place& p : places) { stream_grid.push_back(get_stream_from_pool(p)); From ac389ce44179356026ea03c68ba566d0f12039a6 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:23:20 +0200 Subject: [PATCH 11/19] Add a missing header --- .../cuda/experimental/__stf/places/exec/host/callback_queues.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 88c1a04dca5..e38dcde77b8 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #define STATEFUL_CALLBACKS From 6107281f08bbc091ebdf1cf554b2b46fca858c50 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:49:50 +0200 Subject: [PATCH 12/19] Make the unique id mutable to make it possible to have a default constructor --- 
cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh index 9063f4ead1a..49c3dbcf7d7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh @@ -149,7 +149,7 @@ public: } /// A unique identifier for the event, used to ensure proper event ordering. - const unique_id_t unique_prereq_id; + mutable unique_id_t unique_prereq_id; ::std::atomic outbound_deps = 0; From 72887f5e99b3ac702784907477ce65696b0e2794 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:00:12 +0200 Subject: [PATCH 13/19] Add mutable to allow a default ctor --- .../cuda/experimental/__stf/graph/internal/event_types.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh index 6aa7d127e3e..f017f24ff47 100644 --- a/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh @@ -34,8 +34,8 @@ protected: } public: - const cudaGraphNode_t node; - size_t epoch; + mutable cudaGraphNode_t node; + mutable size_t epoch; }; using graph_event = handle; From 81e909089fd3b19ce52ea49371b41757ad849d33 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:00:38 +0200 Subject: [PATCH 14/19] remove unused variable --- .../include/cuda/experimental/__stf/internal/logical_data.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh index 824f8fe9665..05724ab70d2 100644 --- a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh @@ -575,7 +575,7 @@ public: auto& current_instance = get_data_instance(instance_id); auto current_state = current_instance.get_msir(); - static size_t total_write_back_cnt = 0; +// static size_t total_write_back_cnt = 0; /* Update MSI status depending on the current states and the required access mode */ switch (current_state) @@ -597,7 +597,7 @@ public: prereqs.merge(ref_instance.get_read_prereq(), current_instance.get_read_prereq()); write_back(memory_node, instance_id, prereqs); - total_write_back_cnt++; + // total_write_back_cnt++; // fprintf(stderr, "WRITE BACK... 
%s (%ld)!!\n", get_symbol().c_str(), total_write_back_cnt); ref_instance.add_read_prereq(prereqs); From 907a4cbcf8128b9b4e1b368577ff6c08369d4467 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:15 +0200 Subject: [PATCH 15/19] Avoid a GNU specific syntax by providing a ctor for metadata --- .../experimental/__stf/localization/composite_slice.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index c0115ffc275..3ec4f6a2408 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -36,6 +36,8 @@ class localized_array { struct metadata { + metadata(int dev_, size_t size_, size_t offset_) : alloc_handle{}, dev(dev_), size(size_), offset(offset_) {} + CUmemGenericAllocationHandle alloc_handle; int dev; size_t size; @@ -122,10 +124,7 @@ public: j++; } - meta.push_back({.alloc_handle = {}, - .dev = grid_pos_to_dev(p), - .size = j * alloc_granularity_bytes, - .offset = i * block_size_bytes}); + meta.emplace_back(grid_pos_to_dev(p), j * alloc_granularity_bytes, i * block_size_bytes); i += j; } From 255c8d4b06133b592f28a8f60bdfd357aaa4cb59 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:43 +0200 Subject: [PATCH 16/19] add missing override keyword and remove extra virtual --- .../cuda/experimental/__stf/places/exec/green_context.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index 29623bed7d9..9f9ab3bdb65 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -217,13 +217,13 @@ public: cuda_safe_call(cuCtxSetCurrent(saved_ctx)); } - virtual ::std::string to_string() const override + ::std::string to_string() const override { return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid) + ")"; } - virtual stream_pool& get_stream_pool(async_resources_handle&, bool) const + stream_pool& get_stream_pool(async_resources_handle&, bool) const override { return *pool; } From f7608d0908a553975eb7751342f256b016c80869 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:58 +0200 Subject: [PATCH 17/19] default exec_place_grid ctor --- cudax/include/cuda/experimental/__stf/places/places.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index d2e146f101d..8b9b740f545 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -1025,6 +1025,9 @@ public: return ::std::static_pointer_cast(exec_place::get_impl()); } + // Default constructor + exec_place_grid() : exec_place(nullptr) {} + // private: exec_place_grid(::std::shared_ptr p) : exec_place(mv(p)) From b9a2617b18d7979ff1b463f133884cfd70bbe2a5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:17:46 +0200 Subject: [PATCH 18/19] Remove useless const keywords --- .../cuda/experimental/__stf/places/blocked_partition.cuh | 2 +- cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh | 4 
++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh index 27a0e5e198b..8f1c042e885 100644 --- a/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh @@ -64,7 +64,7 @@ public: } template - CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) { constexpr size_t dimensions = mdspan_shape_t::rank(); diff --git a/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh index 9e8e850d051..98134e69349 100644 --- a/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh +++ b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh @@ -202,7 +202,7 @@ public: cyclic_partition() = default; template - CUDASTF_HOST_DEVICE static const auto apply(const box& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const box& in, pos4 place_position, dim4 grid_dims) { ::std::array begins; ::std::array ends; @@ -218,7 +218,7 @@ public: } template - CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) { constexpr size_t dimensions = mdspan_shape_t::rank(); From 64f7b93cd28a396053794933858dcb7de1e0fa04 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:22:57 +0200 Subject: [PATCH 19/19] Revert change commited by mistake --- .devcontainer/cccl-entrypoint.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.devcontainer/cccl-entrypoint.sh b/.devcontainer/cccl-entrypoint.sh index de42a5d5666..79cd453c771 100755 --- a/.devcontainer/cccl-entrypoint.sh +++ b/.devcontainer/cccl-entrypoint.sh @@ -8,10 +8,7 @@ devcontainer-utils-post-create-command; devcontainer-utils-init-git; devcontainer-utils-post-attach-command; -# cd /home/coder/cccl/ -echo "CLANG .." -clang-format -i cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh -clang-format --version +cd /home/coder/cccl/ if test $# -gt 0; then exec "$@";