Cudastf #4

Merged (19 commits, Oct 11, 2024)

Changes from all commits
6 changes: 3 additions & 3 deletions cudax/cmake/cudaxHeaderTesting.cmake
@@ -57,9 +57,9 @@ function(cudax_add_header_test label definitions)
# FIXME: layout_left::mapping referenced before layout_left:
cuda/experimental/__stf/supplemental_std_experimental/__p0009_bits/layout_left.hpp

# FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114)
cuda/experimental/__stf/stream/stream_task.cuh
cuda/experimental/__stf/stream/stream_ctx.cuh
# # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114)
# cuda/experimental/__stf/stream/stream_task.cuh
# cuda/experimental/__stf/stream/stream_ctx.cuh
)
target_link_libraries(${headertest_target} PUBLIC ${cn_target})
target_compile_definitions(${headertest_target} PRIVATE
4 changes: 2 additions & 2 deletions cudax/examples/stf/1f1b.cu
@@ -81,7 +81,7 @@ int main(int argc, char** argv) {
for (size_t iter = 0; iter < niter; iter++) {
size_t task_cnt = 0;
for (size_t b = 0; b < num_batches; b++) {
for (size_t d = 0; d < num_devs; d++) {
for (int d = 0; d < num_devs; d++) {
ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) {
int ms = 10;
long long int clock_cnt = (long long int) (ms * clock_rate / factor);
@@ -92,7 +92,7 @@
// }
//
// for (size_t b = 0; b < num_batches; b++) {
for (size_t d = num_devs; d-- > 0;) {
for (int d = num_devs; d-- > 0;) {
ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) {
int ms = 20;
long long int clock_cnt = (long long int) (ms * clock_rate / factor);
2 changes: 1 addition & 1 deletion cudax/examples/stf/binary_fhe.cu
@@ -24,7 +24,7 @@ class plaintext {
public:
plaintext(const context& ctx) : ctx(ctx) {}

plaintext(context& ctx, std::vector<char> v) : ctx(ctx), values(v) {
plaintext(context& ctx, std::vector<char> v) : values(v), ctx(ctx) {
l = ctx.logical_data(&values[0], values.size());
}

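The initializer-list swap above avoids a -Wreorder warning: members are always initialized in declaration order, so the initializer list should match it. A standalone sketch of the same pattern (the `reorder_demo` type below is hypothetical, not the STF `plaintext` class):

```cpp
#include <utility>
#include <vector>

struct reorder_demo
{
  // Members are initialized in this declaration order, regardless of the
  // order written in the constructor's initializer list.
  std::vector<char> values;
  int ctx_id;

  // Listing ctx_id before values in the initializer list would trigger
  // -Wreorder (an error under -Werror), hence the swap in the diff above.
  reorder_demo(int ctx, std::vector<char> v) : values(std::move(v)), ctx_id(ctx) {}
};

int main()
{
  reorder_demo d(0, {1, 0, 1});
  return d.ctx_id;
}
```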
4 changes: 2 additions & 2 deletions cudax/examples/stf/graph_algorithms/pagerank.cu
@@ -124,8 +124,8 @@ int main() {
printf("Page rank answer is %s.\n", abs(sum_pageranks - 1.0) < 0.001 ? "correct" : "not correct");

printf("PageRank Results:\n");
for (int i = 0; i < page_rank.size(); ++i) {
printf("Vertex %d: %f\n", i, page_rank[i]);
for (size_t i = 0; i < page_rank.size(); ++i) {
printf("Vertex %ld: %f\n", i, page_rank[i]);
}

return 0;
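This change, like the similar ones in 1f1b.cu, mandelbrot.cu, and redundant_data.cu below, keeps the loop index type consistent with the container's `size_t` bound so that -Wsign-compare stays quiet under warnings-as-errors. A minimal sketch (note that `%zu` is the strictly portable conversion for `size_t`; the `%ld` used in the diff assumes `long` and `size_t` have the same width on the target):

```cpp
#include <cstdio>
#include <vector>

int main()
{
  std::vector<double> page_rank(4, 0.25);

  // page_rank.size() is size_t; an int index would mix signed and unsigned
  // operands in the comparison and trigger -Wsign-compare.
  for (size_t i = 0; i < page_rank.size(); ++i)
  {
    std::printf("Vertex %zu: %f\n", i, page_rank[i]);
  }
  return 0;
}
```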
4 changes: 2 additions & 2 deletions cudax/examples/stf/mandelbrot.cu
@@ -95,8 +95,8 @@ int main(int argc, char** argv) {
imageFile << width << " " << height << "\n";
imageFile << "255\n";

for (int y = 0; y < height; y++)
for (int x = 0; x < width; x++) {
for (size_t y = 0; y < height; y++)
for (size_t x = 0; x < width; x++) {
int iterations = buffer(x, y);
// Convert iterations to RGB values
unsigned char r = (iterations % 8) * 32;
2 changes: 1 addition & 1 deletion cudax/examples/stf/word_count.cu
@@ -40,7 +40,7 @@ int main() {

context ctx;

auto ltext = ctx.logical_data((char*) &raw_input[0], { sizeof(raw_input) });
auto ltext = ctx.logical_data(const_cast<char*>(&raw_input[0]), { sizeof(raw_input) });

int cnt = 0;
auto lcnt = ctx.logical_data(&cnt, { 1 });
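Replacing the C-style cast with `const_cast` makes it explicit that const-ness is the only qualifier being removed; a C-style cast would also silently allow unrelated pointer conversions. A hedged sketch with a hypothetical `register_buffer` standing in for `ctx.logical_data`:

```cpp
#include <cstddef>

// Hypothetical stand-in for an API that takes a mutable buffer, as
// ctx.logical_data(char*, ...) does in the diff above.
void register_buffer(char* data, std::size_t len)
{
  (void) data;
  (void) len;
}

int main()
{
  static const char raw_input[] = "The quick brown fox";

  // const_cast documents that only const is being stripped; a C-style cast
  // such as (char*) &raw_input[0] would compile even if the pointee type
  // were wrong. The callee must still never write through this pointer.
  register_buffer(const_cast<char*>(&raw_input[0]), sizeof(raw_input));
  return 0;
}
```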
@@ -34,8 +34,8 @@ protected:
}

public:
const cudaGraphNode_t node;
size_t epoch;
mutable cudaGraphNode_t node;
mutable size_t epoch;
};

using graph_event = handle<graph_event_impl, handle_flags::non_null>;
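Dropping `const` in favour of `mutable` here presumably allows the node/epoch bookkeeping to be updated through the const access that the `handle<>` wrapper exposes (const data members would also make the type non-assignable). A minimal sketch of that pattern, with hypothetical names:

```cpp
#include <cstddef>

// Hypothetical sketch: an event whose bookkeeping fields may be updated
// even when the event is only reachable through a const reference.
class event_impl
{
public:
  // `mutable` permits assignment in const contexts, which `const` members
  // would forbid.
  mutable int node = -1;
  mutable std::size_t epoch = 0;
};

void advance(const event_impl& e)
{
  // Legal only because the member is mutable.
  e.epoch++;
}

int main()
{
  const event_impl e;
  advance(e);
  return static_cast<int>(e.epoch); // 1
}
```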
@@ -149,7 +149,7 @@ public:
}

/// A unique identifier for the event, used to ensure proper event ordering.
const unique_id_t unique_prereq_id;
mutable unique_id_t unique_prereq_id;

::std::atomic<int> outbound_deps = 0;

25 changes: 13 additions & 12 deletions cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh
@@ -281,17 +281,6 @@ public:
dot.template add_vertex<typename Ctx::task_type, logical_data_untyped>(t);
}

auto insert_one_kernel = [](cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) {
cudaKernelNodeParams kconfig;
kconfig.blockDim = k.blockDim;
kconfig.extra = nullptr;
kconfig.func = (void*) k.func;
kconfig.gridDim = k.gridDim;
kconfig.kernelParams = k.args_ptr.data();
kconfig.sharedMemBytes = k.sharedMem;
cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
};

// When chained is enabled, we expect a vector of kernel descriptions which should be executed one after the other
if constexpr (chained)
{
@@ -362,6 +351,18 @@ public:
}

private:
/* Add a kernel to a CUDA graph given its description */
auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const {
cudaKernelNodeParams kconfig;
kconfig.blockDim = k.blockDim;
kconfig.extra = nullptr;
kconfig.func = const_cast<void *>(k.func);
kconfig.gridDim = k.gridDim;
kconfig.kernelParams = k.args_ptr.data();
kconfig.sharedMemBytes = k.sharedMem;
cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
}

::std::string symbol;
Ctx& ctx;
task_dep_vector<Deps...> deps;
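`insert_one_kernel` moves from a local lambda into a private member function, but its body remains the standard recipe for adding a kernel node to a CUDA graph. A self-contained sketch using the plain runtime API (the `scale` kernel and the sizes are made up for illustration, not taken from STF):

```cuda
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale(float* data, float factor)
{
  data[threadIdx.x] *= factor;
}

int main()
{
  float* d_data;
  cudaMalloc(&d_data, 32 * sizeof(float));

  cudaGraph_t graph;
  cudaGraphCreate(&graph, 0);

  // Fill cudaKernelNodeParams the same way insert_one_kernel does above:
  // function pointer, launch geometry, shared memory, and argument pointers.
  float factor = 2.0f;
  void* args[] = {&d_data, &factor};

  cudaKernelNodeParams params{};
  params.func           = (void*) scale;
  params.gridDim        = dim3(1);
  params.blockDim       = dim3(32);
  params.sharedMemBytes = 0;
  params.kernelParams   = args;
  params.extra          = nullptr;

  cudaGraphNode_t node;
  cudaError_t err = cudaGraphAddKernelNode(&node, graph, nullptr, 0, &params);
  std::printf("cudaGraphAddKernelNode: %s\n", cudaGetErrorString(err));

  cudaGraphDestroy(graph);
  cudaFree(d_data);
  return 0;
}
```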
@@ -1068,7 +1069,7 @@ private:
template <typename T, typename... P>
auto make_data_interface(P&&... p)
{
return ::std::make_shared<typename Engine::data_interface<T>>(::std::forward<P>(p)...);
return ::std::make_shared<typename Engine::template data_interface<T>>(::std::forward<P>(p)...);
}
};

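The added `template` keyword above is required because `Engine::data_interface` is a dependent name: without it, the `<` after the member template is parsed as a less-than operator. A minimal reproduction (the `engine` type below is hypothetical):

```cpp
#include <memory>

template <typename T>
struct engine
{
  template <typename U>
  struct data_interface
  {
    U value{};
  };
};

template <typename Engine, typename T>
auto make_data_interface()
{
  // Engine is a dependent type here, so the nested member template must be
  // named with both `typename` (it names a type) and `template` (it is a
  // template); otherwise the compiler cannot parse the angle brackets.
  return std::make_shared<typename Engine::template data_interface<T>>();
}

int main()
{
  auto iface = make_data_interface<engine<float>, int>();
  iface->value = 42;
  return 0;
}
```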
@@ -575,7 +575,7 @@ public:
auto& current_instance = get_data_instance(instance_id);
auto current_state = current_instance.get_msir();

static size_t total_write_back_cnt = 0;
// static size_t total_write_back_cnt = 0;

/* Update MSI status depending on the current states and the required access mode */
switch (current_state)
@@ -597,7 +597,7 @@
prereqs.merge(ref_instance.get_read_prereq(), current_instance.get_read_prereq());

write_back(memory_node, instance_id, prereqs);
total_write_back_cnt++;
// total_write_back_cnt++;
// fprintf(stderr, "WRITE BACK... %s (%ld)!!\n", get_symbol().c_str(), total_write_back_cnt);

ref_instance.add_read_prereq(prereqs);
@@ -287,7 +287,6 @@ public:
}
else
{
auto grid_dims = t.grid_dims();
size_t grid_size = t.grid_dims().size();
for (size_t i = 0; i < grid_size; i++)
{
@@ -353,7 +352,7 @@ public:
}();

const auto [block_size, min_blocks] = conf;
const size_t n = sub_shape.size();
size_t n = sub_shape.size();

// If there is no item in that shape, no need to launch a kernel !
if (n == 0)
@@ -428,7 +427,7 @@ public:

auto arg1 = mv(explode_deps);
auto arg2 = deps.instance(t);
void* kernelArgs[] = {(void*) &n, &arg1, &arg2};
void* kernelArgs[] = {&n, &arg1, &arg2};
kernel_params.kernelParams = kernelArgs;
kernel_params.extra = nullptr;

4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/internal/slice.cuh
@@ -1065,7 +1065,7 @@ size_t data_hash([[maybe_unused]] mdspan<E, X, L, A> s, ::std::index_sequence<i...
* Write the content of the mdspan into a file
*/
template <typename E, typename X, typename L, typename A, size_t... i>
void data_dump(mdspan<E, X, L, A> s,
void data_dump([[maybe_unused]] mdspan<E, X, L, A> s,
::std::ostream& file = ::std::cerr,
::std::index_sequence<i...> = ::std::index_sequence<>())
{
@@ -1181,7 +1181,7 @@ struct std::hash<::cuda::experimental::stf::mdspan<P...>>

if constexpr (_dimensions > 1)
{
for (auto i = 1; i < _dimensions; i++)
for (size_t i = 1; i < _dimensions; i++)
{
cuda::experimental::stf::hash_combine(h, s.stride(i));
}
@@ -36,6 +36,8 @@ class localized_array
{
struct metadata
{
metadata(int dev_, size_t size_, size_t offset_) : alloc_handle{}, dev(dev_), size(size_), offset(offset_) {}

CUmemGenericAllocationHandle alloc_handle;
int dev;
size_t size;
@@ -122,10 +124,7 @@ public:
j++;
}

meta.push_back({.alloc_handle = {},
.dev = grid_pos_to_dev(p),
.size = j * alloc_granularity_bytes,
.offset = i * block_size_bytes});
meta.emplace_back(grid_pos_to_dev(p), j * alloc_granularity_bytes, i * block_size_bytes);

i += j;
}
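Because `metadata` now has a user-provided constructor it is no longer an aggregate, so the designated-initializer `push_back({...})` form has to become an `emplace_back` that forwards the constructor arguments. A simplified sketch of the same pattern (the `alloc_handle` member is omitted and the values are made up):

```cpp
#include <cstddef>
#include <vector>

struct metadata
{
  // A user-provided constructor makes this type a non-aggregate, so
  // designated initializers ({.dev = ..., .size = ...}) no longer apply.
  metadata(int dev_, std::size_t size_, std::size_t offset_)
      : dev(dev_), size(size_), offset(offset_)
  {}

  int dev;
  std::size_t size;
  std::size_t offset;
};

int main()
{
  std::vector<metadata> meta;

  // emplace_back forwards the arguments straight to the constructor,
  // avoiding both a temporary and the aggregate-initialization requirement.
  meta.emplace_back(/* dev */ 0, /* size */ 1 << 20, /* offset */ 0);
  return static_cast<int>(meta.size());
}
```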
@@ -64,7 +64,7 @@ public:
}

template <typename mdspan_shape_t>
CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
{
constexpr size_t dimensions = mdspan_shape_t::rank();

4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh
@@ -202,7 +202,7 @@ public:
cyclic_partition() = default;

template <size_t dimensions>
CUDASTF_HOST_DEVICE static const auto apply(const box<dimensions>& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const box<dimensions>& in, pos4 place_position, dim4 grid_dims)
{
::std::array<size_t, dimensions> begins;
::std::array<size_t, dimensions> ends;
@@ -218,7 +218,7 @@
}

template <typename mdspan_shape_t>
CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims)
{
constexpr size_t dimensions = mdspan_shape_t::rank();

@@ -217,13 +217,13 @@ public:
cuda_safe_call(cuCtxSetCurrent(saved_ctx));
}

virtual ::std::string to_string() const override
::std::string to_string() const override
{
return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid)
+ ")";
}

virtual stream_pool& get_stream_pool(async_resources_handle&, bool) const
stream_pool& get_stream_pool(async_resources_handle&, bool) const override
{
return *pool;
}
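Dropping the redundant `virtual` and adding `override` lets the compiler verify that `get_stream_pool` really overrides a base-class virtual, instead of silently declaring an unrelated overload if the signatures drift apart. A minimal illustration (the class names below are placeholders, not the STF hierarchy):

```cpp
#include <string>

struct exec_place_base
{
  virtual ~exec_place_base() = default;
  virtual std::string to_string() const { return "base"; }
};

struct green_ctx_place : exec_place_base
{
  // `override` makes the compiler check that this really overrides a virtual
  // function in the base class; repeating `virtual` would be redundant.
  // If the base signature later changed (e.g. dropped const), this line
  // would fail to compile instead of becoming a separate, never-called overload.
  std::string to_string() const override { return "green ctx"; }
};

int main()
{
  green_ctx_place p;
  return p.to_string().empty() ? 1 : 0;
}
```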
@@ -18,6 +18,7 @@
#include <cuda/experimental/__stf/places/exec/host/callback_queues.cuh>
#include <cuda/experimental/__stf/utility/traits.cuh>
#include <stack>
#include <cstdio>

#define STATEFUL_CALLBACKS

37 changes: 28 additions & 9 deletions cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -804,12 +804,28 @@ public:

bool operator==(const exec_place::impl& rhs) const override
{
if (!exec_place::impl::operator==(rhs))
// First, check if rhs is of type exec_place_grid::impl
auto other = dynamic_cast<const impl*>(&rhs);
if (!other)
{
return false;
return false; // rhs is not a grid, so they are not equal
}
auto other = dynamic_cast<const impl*>(&rhs);
return other && dims == other->dims && places == other->places;

// Compare two grids
return *this == *other;
}

// Compare two grids
bool operator==(const impl& rhs) const
{
// First, compare base class properties
if (!exec_place::impl::operator==(rhs))
{
return false;
}

// Compare grid-specific properties
return dims == rhs.dims && places == rhs.places;
}

const ::std::vector<exec_place>& get_places() const
@@ -1009,6 +1025,9 @@ public:
return ::std::static_pointer_cast<impl>(exec_place::get_impl());
}

// Default constructor
exec_place_grid() : exec_place(nullptr) {}

// private:
exec_place_grid(::std::shared_ptr<impl> p)
: exec_place(mv(p))
@@ -1537,7 +1556,7 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
}
else
{
if (l1_size > block_size_limit)
if (int(l1_size) > block_size_limit)
{
fprintf(stderr,
"Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 1)\n",
@@ -1553,7 +1572,7 @@
}

// Enforce the resource limits in the number of threads per block
assert(l1_size <= block_size_limit);
assert(int(l1_size) <= block_size_limit);

assert(l0_size % ndevs == 0);

@@ -1589,7 +1608,7 @@
}
else
{
if (l2_size > block_size_limit)
if (int(l2_size) > block_size_limit)
{
fprintf(stderr,
"Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 2)\n",
Expand All @@ -1610,8 +1629,8 @@ interpreted_execution_policy<spec...>::interpreted_execution_policy(
}

// Enforce the resource limits in the number of threads per block
assert(l2_size <= block_size_limit);
assert(l0_size <= ndevs);
assert(int(l2_size) <= block_size_limit);
assert(int(l0_size) <= ndevs);

/* Merge blocks and devices */
this->add_level({::std::make_pair(hw_scope::device, l0_size)});
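The equality rework above splits the check in two: the virtual `operator==(const exec_place::impl&)` only establishes via `dynamic_cast` that both sides are grids, and a non-virtual grid-to-grid overload compares the base state plus `dims`/`places`. A hedged sketch of that pattern with generic names (not the STF classes):

```cpp
struct base_impl
{
  virtual ~base_impl() = default;

  int id = 0;

  virtual bool operator==(const base_impl& rhs) const
  {
    return id == rhs.id;
  }
};

struct grid_impl : base_impl
{
  int dims = 1;

  bool operator==(const base_impl& rhs) const override
  {
    // First establish that rhs is also a grid; otherwise the two differ.
    auto other = dynamic_cast<const grid_impl*>(&rhs);
    return other && *this == *other; // delegate to the grid-to-grid overload
  }

  // Grid-to-grid comparison: base state first, then grid-specific state.
  bool operator==(const grid_impl& rhs) const
  {
    return base_impl::operator==(rhs) && dims == rhs.dims;
  }
};

int main()
{
  grid_impl a, b;
  base_impl c;
  return (a == b) && !(a == c) ? 0 : 1;
}
```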
6 changes: 5 additions & 1 deletion cudax/include/cuda/experimental/__stf/stream/stream_task.cuh
@@ -111,7 +111,11 @@ public:
// We have currently no way to pass an array of per-place streams
assert(automatic_stream);

const auto& places = e_place.as_grid().get_places();
// Note: we store grid in a variable to avoid dangling references
// because the compiler does not know we are making a reference to
// a vector that remains valid
const auto& grid = e_place.as_grid();
const auto& places = grid.get_places();
for (const exec_place& p : places)
{
stream_grid.push_back(get_stream_from_pool(p));
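This is the workaround for the FIXME recorded in cudaxHeaderTesting.cmake: assuming `as_grid()` returns the grid by value (as the "possibly dangling reference to a temporary" diagnostic suggests), `e_place.as_grid().get_places()` binds `places` to a vector inside a temporary that is destroyed at the end of the statement; naming the grid first keeps it alive for the whole scope. A toy reproduction (the `grid`/`as_grid` names below are stand-ins):

```cpp
#include <cstdio>
#include <vector>

struct grid
{
  std::vector<int> places{1, 2, 3};
  const std::vector<int>& get_places() const { return places; }
};

grid as_grid() { return grid{}; } // returns by value, like the FIXME case

int main()
{
  // BAD: the temporary grid dies at the end of this statement, so the
  // reference would point into a destroyed object.
  // const auto& dangling = as_grid().get_places();

  // GOOD: bind the temporary itself to a reference first; lifetime extension
  // keeps `g` alive for the whole scope, so `places` stays valid.
  const auto& g = as_grid();
  const auto& places = g.get_places();

  std::printf("%zu places\n", places.size());
  return 0;
}
```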
1 change: 1 addition & 0 deletions cudax/include/cuda/experimental/__stf/utility/traits.cuh
@@ -138,6 +138,7 @@ class print_type_name_and_fail
template <class T>
class meyers_singleton
{
protected:
template <class U>
struct wrapper
{
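The added `protected:` presumably lets classes deriving from `meyers_singleton` reach the nested `wrapper`. For reference, the Meyers-singleton idiom itself is just a function-local static whose initialization is thread-safe since C++11; a minimal sketch (not the CCCL implementation):

```cpp
class config
{
public:
  // Meyers singleton: the local static is constructed on first use, and its
  // initialization is guaranteed thread-safe since C++11; there are no
  // global-initialization-order issues.
  static config& instance()
  {
    static config single;
    return single;
  }

  int verbosity = 0;

private:
  config() = default;
  config(const config&) = delete;
  config& operator=(const config&) = delete;
};

int main()
{
  config::instance().verbosity = 2;
  return config::instance().verbosity; // 2: the same object both times
}
```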
2 changes: 1 addition & 1 deletion cudax/test/stf/cpp/redundant_data.cu
@@ -75,7 +75,7 @@ int main()
{
double X[n], Y[n];

for (int ind = 0; ind < n; ind++)
for (size_t ind = 0; ind < n; ind++)
{
X[ind] = 1.0 * ind;
Y[ind] = 2.0 * ind - 3.0;