From 3d446f156743a1ef5352debfdb550f93d2cf9a29 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:29:47 +0200 Subject: [PATCH 01/19] fix constness issue --- .../cuda/experimental/__stf/internal/parallel_for_scope.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 51ad219827f..f5b0b6d19d4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -353,7 +353,7 @@ public: }(); const auto [block_size, min_blocks] = conf; - const size_t n = sub_shape.size(); + size_t n = sub_shape.size(); // If there is no item in that shape, no need to launch a kernel ! if (n == 0) @@ -428,7 +428,7 @@ public: auto arg1 = mv(explode_deps); auto arg2 = deps.instance(t); - void* kernelArgs[] = {(void*) &n, &arg1, &arg2}; + void* kernelArgs[] = {&n, &arg1, &arg2}; kernel_params.kernelParams = kernelArgs; kernel_params.extra = nullptr; From 2b30f49f1be58cbe223985930da7df771d2db3d2 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:30:12 +0200 Subject: [PATCH 02/19] use size_t instead of integers --- cudax/test/stf/examples/05-stencil-no-copy.cu | 20 +++++++++---------- cudax/test/stf/freeze/freeze.cu | 2 +- cudax/test/stf/graph/graph_composition.cu | 4 +--- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cudax/test/stf/examples/05-stencil-no-copy.cu b/cudax/test/stf/examples/05-stencil-no-copy.cu index 6202e7fdab3..4d6e4f850dc 100644 --- a/cudax/test/stf/examples/05-stencil-no-copy.cu +++ b/cudax/test/stf/examples/05-stencil-no-copy.cu @@ -52,7 +52,7 @@ T check_sum(stream_ctx& ctx, data_block& bn) auto t = ctx.task(exec_place::host, bn.handle.read()); t->*[&](cudaStream_t stream, auto h_center) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int offset = bn.ghost_size; offset < bn.ghost_size + bn.block_size; offset++) + for (size_t offset = bn.ghost_size; offset < bn.ghost_size + bn.block_size; offset++) { sum += h_center.data_handle()[offset]; } @@ -147,8 +147,8 @@ int main(int argc, char** argv) stream_ctx ctx; int NITER = 500; - int NBLOCKS = 4 * ndevs; - int BLOCK_SIZE = 2048 * 1024; + size_t NBLOCKS = 4 * ndevs; + size_t BLOCK_SIZE = 2048 * 1024; if (argc > 1) { @@ -174,16 +174,16 @@ int main(int argc, char** argv) std::vector> Un1; // Create blocks and allocates host data - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { - int beg = b * BLOCK_SIZE; - int end = (b + 1) * BLOCK_SIZE; + size_t beg = b * BLOCK_SIZE; + size_t end = (b + 1) * BLOCK_SIZE; Un.emplace_back(ctx, beg, end, 1); Un1.emplace_back(ctx, beg, end, 1); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { Un[b].dev_id = b % ndevs; Un1[b].dev_id = b % ndevs; @@ -192,14 +192,14 @@ int main(int argc, char** argv) // Fill blocks with initial values. For the sake of simplicity, we are // using a synchronization primitive and host code, but this could have // been written asynchronously using host callbacks. 
- for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { - int beg = b * BLOCK_SIZE; + size_t beg = b * BLOCK_SIZE; auto t = ctx.task(exec_place::host, Un[b].handle.rw(), Un1[b].handle.rw()); t->*[&](cudaStream_t stream, auto Un_vals, auto Un1_vals) { cuda_safe_call(cudaStreamSynchronize(stream)); - for (int local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) { double val = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; Un1_vals.data_handle()[local_idx + GHOST_SIZE] = val; diff --git a/cudax/test/stf/freeze/freeze.cu b/cudax/test/stf/freeze/freeze.cu index 691552b9aa6..12ca1119e01 100644 --- a/cudax/test/stf/freeze/freeze.cu +++ b/cudax/test/stf/freeze/freeze.cu @@ -75,7 +75,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == 2 * X0(i) + 1); } diff --git a/cudax/test/stf/graph/graph_composition.cu b/cudax/test/stf/graph/graph_composition.cu index f8da5502e55..45d298d8bc1 100644 --- a/cudax/test/stf/graph/graph_composition.cu +++ b/cudax/test/stf/graph/graph_composition.cu @@ -90,12 +90,10 @@ int main() size_t N = 256 * 1024; size_t K = 8; - size_t BATCH_SIZE = 4; - logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lY[i] = ctx.logical_data(N); From c8d6eec1627ae60e54e55a9b97a7a5dec91df210 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Thu, 10 Oct 2024 23:40:02 +0200 Subject: [PATCH 03/19] fix constness issue --- cudax/test/stf/cpp/redundant_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/test/stf/cpp/redundant_data.cu b/cudax/test/stf/cpp/redundant_data.cu index 95c4caf156b..d494c06d39a 100644 --- a/cudax/test/stf/cpp/redundant_data.cu +++ b/cudax/test/stf/cpp/redundant_data.cu @@ -75,7 +75,7 @@ int main() { double X[n], Y[n]; - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { X[ind] = 1.0 * ind; Y[ind] = 2.0 * ind - 3.0; From dfd293b2af163bb7a7ffcc6c638c960dac178673 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 00:28:53 +0200 Subject: [PATCH 04/19] Fix minor warnings --- .devcontainer/cccl-entrypoint.sh | 5 +++- cudax/cmake/cudaxHeaderTesting.cmake | 6 ++--- .../__stf/internal/parallel_for_scope.cuh | 1 - .../experimental/__stf/internal/slice.cuh | 4 +-- .../cuda/experimental/__stf/places/places.cuh | 10 +++---- .../experimental/__stf/utility/traits.cuh | 1 + cudax/test/stf/examples/05-stencil-no-copy.cu | 10 +++---- cudax/test/stf/examples/05-stencil.cu | 26 +++++++++---------- .../test/stf/examples/05-stencil2d-places.cu | 4 +-- .../test/stf/freeze/constant_logical_data.cu | 2 +- cudax/test/stf/freeze/task_fence.cu | 2 +- cudax/test/stf/graph/for_each_batched.cu | 6 ++--- .../test/stf/graph/for_each_batched_write.cu | 12 ++++----- cudax/test/stf/graph/freeze_for_graph.cu | 2 +- cudax/test/stf/graph/graph_composition.cu | 4 +-- cudax/test/stf/green_context/gc_grid.cu | 2 +- 16 files changed, 50 insertions(+), 47 deletions(-) diff --git a/.devcontainer/cccl-entrypoint.sh b/.devcontainer/cccl-entrypoint.sh index 79cd453c771..de42a5d5666 100755 --- a/.devcontainer/cccl-entrypoint.sh +++ b/.devcontainer/cccl-entrypoint.sh @@ -8,7 +8,10 @@ devcontainer-utils-post-create-command; devcontainer-utils-init-git; devcontainer-utils-post-attach-command; -cd /home/coder/cccl/ +# cd /home/coder/cccl/ +echo "CLANG .." 
+clang-format -i cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +clang-format --version if test $# -gt 0; then exec "$@"; diff --git a/cudax/cmake/cudaxHeaderTesting.cmake b/cudax/cmake/cudaxHeaderTesting.cmake index c4cd43c5d21..d9a584b5d97 100644 --- a/cudax/cmake/cudaxHeaderTesting.cmake +++ b/cudax/cmake/cudaxHeaderTesting.cmake @@ -57,9 +57,9 @@ function(cudax_add_header_test label definitions) # FIXME: layout_left::mapping referenced before layout_left: cuda/experimental/__stf/supplemental_std_experimental/__p0009_bits/layout_left.hpp - # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114) - cuda/experimental/__stf/stream/stream_task.cuh - cuda/experimental/__stf/stream/stream_ctx.cuh + # # FIXME: error: possibly dangling reference to a temporary (stream_task.cuh:114) + # cuda/experimental/__stf/stream/stream_task.cuh + # cuda/experimental/__stf/stream/stream_ctx.cuh ) target_link_libraries(${headertest_target} PUBLIC ${cn_target}) target_compile_definitions(${headertest_target} PRIVATE diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index f5b0b6d19d4..2344e809d7c 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -287,7 +287,6 @@ public: } else { - auto grid_dims = t.grid_dims(); size_t grid_size = t.grid_dims().size(); for (size_t i = 0; i < grid_size; i++) { diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh index 0e3157f4d17..5cfa19bc1f5 100644 --- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh @@ -1065,7 +1065,7 @@ size_t data_hash([[maybe_unused]] mdspan s, ::std::index_sequence -void data_dump(mdspan s, +void data_dump([[maybe_unused]] mdspan s, ::std::ostream& file = ::std::cerr, ::std::index_sequence = ::std::index_sequence<>()) { @@ -1181,7 +1181,7 @@ struct std::hash<::cuda::experimental::stf::mdspan> if constexpr (_dimensions > 1) { - for (auto i = 1; i < _dimensions; i++) + for (size_t i = 1; i < _dimensions; i++) { cuda::experimental::stf::hash_combine(h, s.stride(i)); } diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 9441ddf9f12..5f0914a9922 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -1537,7 +1537,7 @@ interpreted_execution_policy::interpreted_execution_policy( } else { - if (l1_size > block_size_limit) + if (int(l1_size) > block_size_limit) { fprintf(stderr, "Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 1)\n", @@ -1553,7 +1553,7 @@ interpreted_execution_policy::interpreted_execution_policy( } // Enforce the resource limits in the number of threads per block - assert(l1_size <= block_size_limit); + assert(int(l1_size) <= block_size_limit); assert(l0_size % ndevs == 0); @@ -1589,7 +1589,7 @@ interpreted_execution_policy::interpreted_execution_policy( } else { - if (l2_size > block_size_limit) + if (int(l2_size) > block_size_limit) { fprintf(stderr, "Unsatisfiable spec: Maximum block size %d threads, requested %ld (level 2)\n", @@ -1610,8 +1610,8 @@ interpreted_execution_policy::interpreted_execution_policy( } // Enforce the resource limits in the number of threads per 
block - assert(l2_size <= block_size_limit); - assert(l0_size <= ndevs); + assert(int(l2_size) <= block_size_limit); + assert(int(l0_size) <= ndevs); /* Merge blocks and devices */ this->add_level({::std::make_pair(hw_scope::device, l0_size)}); diff --git a/cudax/include/cuda/experimental/__stf/utility/traits.cuh b/cudax/include/cuda/experimental/__stf/utility/traits.cuh index 2d987a10000..c443b5fea7c 100644 --- a/cudax/include/cuda/experimental/__stf/utility/traits.cuh +++ b/cudax/include/cuda/experimental/__stf/utility/traits.cuh @@ -138,6 +138,7 @@ class print_type_name_and_fail template class meyers_singleton { +protected: template struct wrapper { diff --git a/cudax/test/stf/examples/05-stencil-no-copy.cu b/cudax/test/stf/examples/05-stencil-no-copy.cu index 4d6e4f850dc..b826fa601f1 100644 --- a/cudax/test/stf/examples/05-stencil-no-copy.cu +++ b/cudax/test/stf/examples/05-stencil-no-copy.cu @@ -210,13 +210,13 @@ int main(int argc, char** argv) for (int iter = 0; iter < NITER; iter++) { - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { update_halo(ctx, Un1[b], Un[(b - 1 + NBLOCKS) % NBLOCKS], Un[(b + 1) % NBLOCKS]); } // UPDATE Un from Un1 - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { stencil(ctx, Un[b], Un1[b]); } @@ -225,7 +225,7 @@ int main(int argc, char** argv) if (iter % 250 == 0) { double sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { sum += check_sum(ctx, Un[b]); } @@ -233,7 +233,7 @@ int main(int argc, char** argv) // fprintf(stderr, "iter %d : CHECK SUM = %e\n", iter, sum); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Copy inner part of Un into Un1 copy_inner(ctx, Un[b], Un1[b]); @@ -242,7 +242,7 @@ int main(int argc, char** argv) // In this stencil, the sum of the elements is supposed to be a constant double sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { sum += check_sum(ctx, Un[b]); } diff --git a/cudax/test/stf/examples/05-stencil.cu b/cudax/test/stf/examples/05-stencil.cu index 83ad14e2124..04f343a2f36 100644 --- a/cudax/test/stf/examples/05-stencil.cu +++ b/cudax/test/stf/examples/05-stencil.cu @@ -42,7 +42,7 @@ public: ctx.task(exec_place::host, handle.read())->*[&](cudaStream_t stream, auto sn) { cuda_safe_call(cudaStreamSynchronize(stream)); const T* h_center = sn.data_handle(); - for (int offset = ghost_size; offset < ghost_size + block_size; offset++) + for (size_t offset = ghost_size; offset < ghost_size + block_size; offset++) { sum += h_center[offset]; } @@ -145,8 +145,8 @@ void copy_array(data_block& bn, data_block& bn1) int main(int argc, char** argv) { int NITER = 500; - int NBLOCKS = 4; - int BLOCK_SIZE = 2048 * 1024; + size_t NBLOCKS = 4; + size_t BLOCK_SIZE = 2048 * 1024; if (argc > 1) { @@ -177,7 +177,7 @@ int main(int argc, char** argv) std::vector> Un1; // Create blocks and allocates host data - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { int beg = b * BLOCK_SIZE; int end = (b + 1) * BLOCK_SIZE; @@ -186,7 +186,7 @@ int main(int argc, char** argv) Un1.emplace_back(beg, end, 1); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { Un[b].preferred_device = b % ndevs; Un1[b].preferred_device = b % ndevs; @@ -195,7 +195,7 @@ int main(int argc, char** argv) // Fill blocks with initial values. 
For the sake of simplicity, we are // using a synchronization primitive and host code, but this could have // been written asynchronously using host callbacks. - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { int beg = b * BLOCK_SIZE; @@ -205,7 +205,7 @@ int main(int argc, char** argv) double* Un_vals = sUn.data_handle(); double* Un1_vals = sUn1.data_handle(); - for (int local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) + for (size_t local_idx = 0; local_idx < BLOCK_SIZE; local_idx++) { Un1_vals[local_idx + GHOST_SIZE] = U0[(beg + local_idx + TOTAL_SIZE) % TOTAL_SIZE]; } @@ -214,25 +214,25 @@ int main(int argc, char** argv) for (int iter = 0; iter < NITER; iter++) { - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Update the internal copies of the left and right boundaries update_inner_interfaces(Un1[b]); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Apply ghost cells from neighbours to put then in the "center" array update_outer_interfaces(Un1[b], Un1[(b - 1 + NBLOCKS) % NBLOCKS], Un1[(b + 1) % NBLOCKS]); } // UPDATE Un from Un1 - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { stencil(Un[b], Un1[b]); } - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { // Save Un into Un1 copy_array(Un[b], Un1[b]); @@ -242,7 +242,7 @@ int main(int argc, char** argv) if (iter % 250 == 0) { double check_sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { check_sum += Un[b].check_sum(); } @@ -253,7 +253,7 @@ int main(int argc, char** argv) // In this stencil, the sum of the elements is supposed to be a constant double check_sum = 0.0; - for (int b = 0; b < NBLOCKS; b++) + for (size_t b = 0; b < NBLOCKS; b++) { check_sum += Un[b].check_sum(); } diff --git a/cudax/test/stf/examples/05-stencil2d-places.cu b/cudax/test/stf/examples/05-stencil2d-places.cu index 828dbacac9c..2fde20dd7b9 100644 --- a/cudax/test/stf/examples/05-stencil2d-places.cu +++ b/cudax/test/stf/examples/05-stencil2d-places.cu @@ -74,7 +74,7 @@ int main(int argc, char** argv) // TODO implement the proper tiled_partitioning along y ! 
data_place cdp = data_place::composite(tiled_partition<128>(), all_devs); - for (int iter = 0; iter < NITER; iter++) + for (size_t iter = 0; iter < NITER; iter++) { // UPDATE Un from Un1 ctx.task(lUn.rw(cdp), lUn1.read(cdp))->*[&](auto stream, auto sUn, auto sUn1) { @@ -99,7 +99,7 @@ int main(int argc, char** argv) if (vtk_dump) { char str[32]; - snprintf(str, 32, "Un_%05d.vtk", iter); + snprintf(str, 32, "Un_%05zu.vtk", iter); mdspan_to_vtk(sUn, std::string(str)); } }; diff --git a/cudax/test/stf/freeze/constant_logical_data.cu b/cudax/test/stf/freeze/constant_logical_data.cu index 716607b86fb..0f410a1322b 100644 --- a/cudax/test/stf/freeze/constant_logical_data.cu +++ b/cudax/test/stf/freeze/constant_logical_data.cu @@ -65,7 +65,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == (5 * i - 3) + 4 * (18 * i - 9) + 4 * (4 * i - 2)); } diff --git a/cudax/test/stf/freeze/task_fence.cu b/cudax/test/stf/freeze/task_fence.cu index 04898c7d7c4..9aa5716aa56 100644 --- a/cudax/test/stf/freeze/task_fence.cu +++ b/cudax/test/stf/freeze/task_fence.cu @@ -93,7 +93,7 @@ int main() ctx.finalize(); - for (int i = 0; i < N; i++) + for (size_t i = 0; i < N; i++) { EXPECT(X[i] == 2 * X0(i) + 1); } diff --git a/cudax/test/stf/graph/for_each_batched.cu b/cudax/test/stf/graph/for_each_batched.cu index d8714a6b0e6..a640fcc9448 100644 --- a/cudax/test/stf/graph/for_each_batched.cu +++ b/cudax/test/stf/graph/for_each_batched.cu @@ -88,15 +88,15 @@ int main() stream_ctx ctx; - size_t N = 256 * 1024; - size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; size_t BATCH_SIZE = 4; logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lY[i] = ctx.logical_data(N); diff --git a/cudax/test/stf/graph/for_each_batched_write.cu b/cudax/test/stf/graph/for_each_batched_write.cu index 7ab09735988..2d044d5f678 100644 --- a/cudax/test/stf/graph/for_each_batched_write.cu +++ b/cudax/test/stf/graph/for_each_batched_write.cu @@ -35,15 +35,15 @@ int main() { stream_ctx ctx; - size_t N = 256 * 1024; - size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; - size_t BATCH_SIZE = 4; + const size_t BATCH_SIZE = 4; logical_data> lX[K]; logical_data> lY[K]; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { lX[i] = ctx.logical_data(N); lX[i].set_symbol("x" + std::to_string(i)); @@ -52,7 +52,7 @@ int main() }; } - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { // NOT INITIALIZED lY[i] = ctx.logical_data(N); @@ -70,7 +70,7 @@ int main() COPY(inner_ctx, lxi, lyi); }; - for (int i = 0; i < K; i++) + for (size_t i = 0; i < K; i++) { // TODO check actual content ctx.task(lY[i].read()).set_symbol("CHECK")->*[](cudaStream_t, auto) {}; diff --git a/cudax/test/stf/graph/freeze_for_graph.cu b/cudax/test/stf/graph/freeze_for_graph.cu index 068f23b7862..f445a89fda1 100644 --- a/cudax/test/stf/graph/freeze_for_graph.cu +++ b/cudax/test/stf/graph/freeze_for_graph.cu @@ -64,7 +64,7 @@ int main() fX.unfreeze(stream); ctx.host_launch(lX.read())->*[](auto x) { - for (int i = 0; i < x.size(); i++) + for (size_t i = 0; i < x.size(); i++) { EXPECT(x(i) == X0(i) + 2); } diff --git a/cudax/test/stf/graph/graph_composition.cu b/cudax/test/stf/graph/graph_composition.cu index 45d298d8bc1..91985720217 100644 --- a/cudax/test/stf/graph/graph_composition.cu +++ b/cudax/test/stf/graph/graph_composition.cu @@ -87,8 +87,8 @@ int main() stream_ctx ctx; - size_t N = 256 * 1024; - 
size_t K = 8; + const size_t N = 256 * 1024; + const size_t K = 8; logical_data> lX[K]; logical_data> lY[K]; diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu index 8e81f1ca6b3..06322b5343f 100644 --- a/cudax/test/stf/green_context/gc_grid.cu +++ b/cudax/test/stf/green_context/gc_grid.cu @@ -89,7 +89,7 @@ int main() auto& g_ctx = gc[devid]; auto cnt = g_ctx.get_count(); - for (int i = 0; i < cnt; i++) + for (size_t i = 0; i < cnt; i++) { places.push_back(exec_place::green_ctx(g_ctx.get_view(i))); } From 15f219752b43ee380f564e6c65b9b09274b108b7 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 01:02:46 +0200 Subject: [PATCH 05/19] Fix sizeness, constness and VLA issues --- cudax/examples/stf/1f1b.cu | 4 ++-- cudax/examples/stf/binary_fhe.cu | 2 +- cudax/examples/stf/graph_algorithms/pagerank.cu | 4 ++-- cudax/examples/stf/mandelbrot.cu | 4 ++-- cudax/test/stf/interface/data_from_device.cu | 2 +- cudax/test/stf/interface/data_from_device_2.cu | 2 +- cudax/test/stf/interface/data_from_device_wb.cu | 2 +- cudax/test/stf/interface/graph_use_device_data.cu | 4 ++-- cudax/test/stf/reclaiming/graph.cu | 2 +- cudax/test/stf/reclaiming/graph_2.cu | 2 +- cudax/test/stf/reclaiming/stream.cu | 2 +- cudax/test/stf/stencil/stencil-1D.cu | 14 +++++++------- cudax/test/stf/stress/empty_tasks_alloc.cu | 2 +- cudax/test/stf/stress/task_bench.cu | 2 +- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cudax/examples/stf/1f1b.cu b/cudax/examples/stf/1f1b.cu index 2add7930d79..7f3ef24ef89 100644 --- a/cudax/examples/stf/1f1b.cu +++ b/cudax/examples/stf/1f1b.cu @@ -81,7 +81,7 @@ int main(int argc, char** argv) { for (size_t iter = 0; iter < niter; iter++) { size_t task_cnt = 0; for (size_t b = 0; b < num_batches; b++) { - for (size_t d = 0; d < num_devs; d++) { + for (int d = 0; d < num_devs; d++) { ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { int ms = 10; long long int clock_cnt = (long long int) (ms * clock_rate / factor); @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // } // // for (size_t b = 0; b < num_batches; b++) { - for (size_t d = num_devs; d-- > 0;) { + for (int d = num_devs; d-- > 0;) { ctx.task(exec_place::device(d % real_devs), data[b].rw())->*[=](cudaStream_t s, auto bd) { int ms = 20; long long int clock_cnt = (long long int) (ms * clock_rate / factor); diff --git a/cudax/examples/stf/binary_fhe.cu b/cudax/examples/stf/binary_fhe.cu index 2898d15119a..1e0f5c226c1 100644 --- a/cudax/examples/stf/binary_fhe.cu +++ b/cudax/examples/stf/binary_fhe.cu @@ -24,7 +24,7 @@ class plaintext { public: plaintext(const context& ctx) : ctx(ctx) {} - plaintext(context& ctx, std::vector v) : ctx(ctx), values(v) { + plaintext(context& ctx, std::vector v) : values(v), ctx(ctx) { l = ctx.logical_data(&values[0], values.size()); } diff --git a/cudax/examples/stf/graph_algorithms/pagerank.cu b/cudax/examples/stf/graph_algorithms/pagerank.cu index 521851acfd8..fa66658d84f 100644 --- a/cudax/examples/stf/graph_algorithms/pagerank.cu +++ b/cudax/examples/stf/graph_algorithms/pagerank.cu @@ -124,8 +124,8 @@ int main() { printf("Page rank answer is %s.\n", abs(sum_pageranks - 1.0) < 0.001 ? 
"correct" : "not correct"); printf("PageRank Results:\n"); - for (int i = 0; i < page_rank.size(); ++i) { - printf("Vertex %d: %f\n", i, page_rank[i]); + for (size_t i = 0; i < page_rank.size(); ++i) { + printf("Vertex %ld: %f\n", i, page_rank[i]); } return 0; diff --git a/cudax/examples/stf/mandelbrot.cu b/cudax/examples/stf/mandelbrot.cu index d0cbd2665e0..8d0fb64920d 100644 --- a/cudax/examples/stf/mandelbrot.cu +++ b/cudax/examples/stf/mandelbrot.cu @@ -95,8 +95,8 @@ int main(int argc, char** argv) { imageFile << width << " " << height << "\n"; imageFile << "255\n"; - for (int y = 0; y < height; y++) - for (int x = 0; x < width; x++) { + for (size_t y = 0; y < height; y++) + for (size_t x = 0; x < width; x++) { int iterations = buffer(x, y); // Convert iterations to RGB values unsigned char r = (iterations % 8) * 32; diff --git a/cudax/test/stf/interface/data_from_device.cu b/cudax/test/stf/interface/data_from_device.cu index 1eb8b0bb20f..69ccbc74666 100644 --- a/cudax/test/stf/interface/data_from_device.cu +++ b/cudax/test/stf/interface/data_from_device.cu @@ -63,7 +63,7 @@ void run() // Access Ask to use X, Y and Z on the host ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto X, auto Y) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/interface/data_from_device_2.cu b/cudax/test/stf/interface/data_from_device_2.cu index 71cacbda7e0..74d5284e988 100644 --- a/cudax/test/stf/interface/data_from_device_2.cu +++ b/cudax/test/stf/interface/data_from_device_2.cu @@ -76,7 +76,7 @@ void run() // Access Ask to use X, Y and Z on the host ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto X, auto Y, auto Z) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(X(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/interface/data_from_device_wb.cu b/cudax/test/stf/interface/data_from_device_wb.cu index fae695109b0..be7ff580582 100644 --- a/cudax/test/stf/interface/data_from_device_wb.cu +++ b/cudax/test/stf/interface/data_from_device_wb.cu @@ -59,7 +59,7 @@ void run() }; ctx.host_launch(handle_X.rw())->*[&](auto X) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { X(ind) = 2.0 * X(ind) + 1.0; } diff --git a/cudax/test/stf/interface/graph_use_device_data.cu b/cudax/test/stf/interface/graph_use_device_data.cu index 8117e0ce750..25af47d0af3 100644 --- a/cudax/test/stf/interface/graph_use_device_data.cu +++ b/cudax/test/stf/interface/graph_use_device_data.cu @@ -42,7 +42,7 @@ __global__ void setup_vectors(int n, T* x, T* y, T* z) int main(int argc, char** argv) { graph_ctx ctx; - size_t n = 12; + const size_t n = 12; const double alpha = 2.0; double *dX, *dY, *dZ; @@ -77,7 +77,7 @@ int main(int argc, char** argv) }; ctx.host_launch(handle_X.read(), handle_Y.read(), handle_Z.read())->*[&](auto hX, auto hY, auto hZ) { - for (int ind = 0; ind < n; ind++) + for (size_t ind = 0; ind < n; ind++) { // X unchanged EXPECT(fabs(hX(ind) - 1.0 * ind) < 0.00001); diff --git a/cudax/test/stf/reclaiming/graph.cu b/cudax/test/stf/reclaiming/graph.cu index 553b6434a6e..8d4973fe092 100644 --- a/cudax/test/stf/reclaiming/graph.cu +++ b/cudax/test/stf/reclaiming/graph.cu @@ -37,7 +37,7 @@ int main(int argc, char** argv) graph_ctx ctx; - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); char* h_buffer = new char[nblocks * block_size]; diff --git 
a/cudax/test/stf/reclaiming/graph_2.cu b/cudax/test/stf/reclaiming/graph_2.cu index 5778172f30f..9099a7be571 100644 --- a/cudax/test/stf/reclaiming/graph_2.cu +++ b/cudax/test/stf/reclaiming/graph_2.cu @@ -36,7 +36,7 @@ int main(int argc, char** argv) setenv("MAX_ALLOC_CNT", "2", 1); graph_ctx ctx; - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); std::vector h_buffer(nblocks * block_size); for (int i = 0; i < nblocks; i++) diff --git a/cudax/test/stf/reclaiming/stream.cu b/cudax/test/stf/reclaiming/stream.cu index a2d6dac67c6..b05c6f73722 100644 --- a/cudax/test/stf/reclaiming/stream.cu +++ b/cudax/test/stf/reclaiming/stream.cu @@ -113,7 +113,7 @@ int main(int argc, char** argv) auto dummy_alloc = block_allocator(ctx); ctx.set_allocator(dummy_alloc); - logical_data> handles[nblocks]; + ::std::vector>> handles(nblocks); char* h_buffer = new char[nblocks * block_size]; diff --git a/cudax/test/stf/stencil/stencil-1D.cu b/cudax/test/stf/stencil/stencil-1D.cu index a7f975f4cae..843a1fb13d9 100644 --- a/cudax/test/stf/stencil/stencil-1D.cu +++ b/cudax/test/stf/stencil/stencil-1D.cu @@ -28,10 +28,10 @@ public: : ghost_size(ghost_size) , block_size(end - beg + 1) , array(new T[block_size + 2 * ghost_size]) - , handle(ctx.logical_data(array.get(), block_size + 2 * ghost_size)) , left_interface(new T[ghost_size]) - , left_handle(ctx.logical_data(left_interface.get(), ghost_size)) , right_interface(new T[ghost_size]) + , handle(ctx.logical_data(array.get(), block_size + 2 * ghost_size)) + , left_handle(ctx.logical_data(left_interface.get(), ghost_size)) , right_handle(ctx.logical_data(right_interface.get(), ghost_size)) {} @@ -46,8 +46,8 @@ public: } public: - size_t block_size; size_t ghost_size; + size_t block_size; std::unique_ptr array; std::unique_ptr left_interface; std::unique_ptr right_interface; @@ -55,11 +55,11 @@ public: // HANDLE = whole data + boundaries logical_data> handle; - // A piece of data to store the right part of the block - logical_data> right_handle; - // A piece of data to store the left part of the block logical_data> left_handle; + + // A piece of data to store the right part of the block + logical_data> right_handle; }; template @@ -173,7 +173,7 @@ int main(int argc, char** argv) // Create the graph - it starts out empty int NITER = 400; - for (size_t iter = 0; iter < NITER; iter++) + for (int iter = 0; iter < NITER; iter++) { // UPDATE Un from Un1 for (size_t b = 0; b < NBLOCKS; b++) diff --git a/cudax/test/stf/stress/empty_tasks_alloc.cu b/cudax/test/stf/stress/empty_tasks_alloc.cu index 2a43ec9b098..7bfeae653e1 100644 --- a/cudax/test/stf/stress/empty_tasks_alloc.cu +++ b/cudax/test/stf/stress/empty_tasks_alloc.cu @@ -30,7 +30,7 @@ int main(int argc, char** argv) std::chrono::steady_clock::time_point start, stop; start = std::chrono::steady_clock::now(); - for (size_t iter = 0; iter < iter_cnt; iter++) + for (int iter = 0; iter < iter_cnt; iter++) { auto lX = ctx.logical_data(shape_of>(N)); auto lY = ctx.logical_data(shape_of>(N)); diff --git a/cudax/test/stf/stress/task_bench.cu b/cudax/test/stf/stress/task_bench.cu index bc43357ac4c..f1cc20d699a 100644 --- a/cudax/test/stf/stress/task_bench.cu +++ b/cudax/test/stf/stress/task_bench.cu @@ -174,7 +174,7 @@ void bench(context& ctx, test_id id, size_t width, size_t nsteps, size_t repeat_ { std::chrono::steady_clock::time_point start, stop; - const int b = nsteps; + const size_t b = nsteps; std::vector>> data(width * b); const size_t data_size = 128; From be00476ce5dc979ddf8932d2513f391273a681ac Mon 
Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:41:26 +0200 Subject: [PATCH 06/19] Solve an unused variable issue due to constexpr --- .../__stf/internal/backend_ctx.cuh | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 05701dd78f1..6ab684fb96a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -281,17 +281,6 @@ public: dot.template add_vertex(t); } - auto insert_one_kernel = [](cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) { - cudaKernelNodeParams kconfig; - kconfig.blockDim = k.blockDim; - kconfig.extra = nullptr; - kconfig.func = (void*) k.func; - kconfig.gridDim = k.gridDim; - kconfig.kernelParams = k.args_ptr.data(); - kconfig.sharedMemBytes = k.sharedMem; - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); - }; - // When chained is enable, we expect a vector of kernel description which should be executed one after the other if constexpr (chained) { @@ -362,6 +351,18 @@ public: } private: + /* Add a kernel to a CUDA graph given its description */ + auto insert_one_kernel(cuda_kernel_desc& k, cudaGraphNode_t& n, cudaGraph_t& g) const { + cudaKernelNodeParams kconfig; + kconfig.blockDim = k.blockDim; + kconfig.extra = nullptr; + kconfig.func = const_cast(k.func); + kconfig.gridDim = k.gridDim; + kconfig.kernelParams = k.args_ptr.data(); + kconfig.sharedMemBytes = k.sharedMem; + cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + } + ::std::string symbol; Ctx& ctx; task_dep_vector deps; From f07a1a1d2154c0ac15a6d9817f0697f014fb106a Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:41:50 +0200 Subject: [PATCH 07/19] fix constness issue in logical_data arguments --- cudax/examples/stf/word_count.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/examples/stf/word_count.cu b/cudax/examples/stf/word_count.cu index 32be1d9e1fd..89b827c2ef9 100644 --- a/cudax/examples/stf/word_count.cu +++ b/cudax/examples/stf/word_count.cu @@ -40,7 +40,7 @@ int main() { context ctx; - auto ltext = ctx.logical_data((char*) &raw_input[0], { sizeof(raw_input) }); + auto ltext = ctx.logical_data(const_cast(&raw_input[0]), { sizeof(raw_input) }); int cnt = 0; auto lcnt = ctx.logical_data(&cnt, { 1 }); From 78cabc1eb265573b9ebc86824b4b35fcf6aa6c1e Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 09:45:47 +0200 Subject: [PATCH 08/19] Solve a parsing error in g++ --- cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh index 6ab684fb96a..ccc33258347 100644 --- a/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh @@ -1069,7 +1069,7 @@ private: template auto make_data_interface(P&&... p) { - return ::std::make_shared>(::std::forward
(p)...);
+    return ::std::make_shared>(::std::forward
(p)...); } }; From 87f8a91dd3340fb1cb16b94a51c824fab7f0636c Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 10:47:18 +0200 Subject: [PATCH 09/19] Fix the equality operator of the exec_place_grid to avoid ambiguities in C++20 --- .../cuda/experimental/__stf/places/places.cuh | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index 5f0914a9922..d2e146f101d 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -804,12 +804,28 @@ public: bool operator==(const exec_place::impl& rhs) const override { - if (!exec_place::impl::operator==(rhs)) + // First, check if rhs is of type exec_place_grid::impl + auto other = dynamic_cast(&rhs); + if (!other) { - return false; + return false; // rhs is not a grid, so they are not equal } - auto other = dynamic_cast(&rhs); - return other && dims == other->dims && places == other->places; + + // Compare two grids + return *this == *other; + } + + // Compare two grids + bool operator==(const impl& rhs) const + { + // First, compare base class properties + if (!exec_place::impl::operator==(rhs)) + { + return false; + } + + // Compare grid-specific properties + return dims == rhs.dims && places == rhs.places; } const ::std::vector& get_places() const From 86af6d491b39f044971b769228ae225ea47fd24d Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:06:14 +0200 Subject: [PATCH 10/19] avoid a dangling reference warning --- .../include/cuda/experimental/__stf/stream/stream_task.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh index b3fcb4b72c1..28fa0ec6028 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_task.cuh @@ -111,7 +111,11 @@ public: // We have currently no way to pass an array of per-place streams assert(automatic_stream); - const auto& places = e_place.as_grid().get_places(); + // Note: we store grid in a variable to avoid dangling references + // because the compiler does not know we are making a refernce to + // a vector that remains valid + const auto& grid = e_place.as_grid(); + const auto& places = grid.get_places(); for (const exec_place& p : places) { stream_grid.push_back(get_stream_from_pool(p)); From ac389ce44179356026ea03c68ba566d0f12039a6 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:23:20 +0200 Subject: [PATCH 11/19] Add a missing header --- .../cuda/experimental/__stf/places/exec/host/callback_queues.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh index 88c1a04dca5..e38dcde77b8 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/host/callback_queues.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #define STATEFUL_CALLBACKS From 6107281f08bbc091ebdf1cf554b2b46fca858c50 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 11:49:50 +0200 Subject: [PATCH 12/19] Make the unique id mutable to make it possible to have a default constructor --- 
cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh index 9063f4ead1a..49c3dbcf7d7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_prereq.cuh @@ -149,7 +149,7 @@ public: } /// A unique identifier for the event, used to ensure proper event ordering. - const unique_id_t unique_prereq_id; + mutable unique_id_t unique_prereq_id; ::std::atomic outbound_deps = 0; From 72887f5e99b3ac702784907477ce65696b0e2794 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:00:12 +0200 Subject: [PATCH 13/19] Add mutable to allow a default ctor --- .../cuda/experimental/__stf/graph/internal/event_types.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh index 6aa7d127e3e..f017f24ff47 100644 --- a/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/internal/event_types.cuh @@ -34,8 +34,8 @@ protected: } public: - const cudaGraphNode_t node; - size_t epoch; + mutable cudaGraphNode_t node; + mutable size_t epoch; }; using graph_event = handle; From 81e909089fd3b19ce52ea49371b41757ad849d33 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:00:38 +0200 Subject: [PATCH 14/19] remove unused variable --- .../include/cuda/experimental/__stf/internal/logical_data.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh index 824f8fe9665..05724ab70d2 100644 --- a/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/logical_data.cuh @@ -575,7 +575,7 @@ public: auto& current_instance = get_data_instance(instance_id); auto current_state = current_instance.get_msir(); - static size_t total_write_back_cnt = 0; +// static size_t total_write_back_cnt = 0; /* Update MSI status depending on the current states and the required access mode */ switch (current_state) @@ -597,7 +597,7 @@ public: prereqs.merge(ref_instance.get_read_prereq(), current_instance.get_read_prereq()); write_back(memory_node, instance_id, prereqs); - total_write_back_cnt++; + // total_write_back_cnt++; // fprintf(stderr, "WRITE BACK... 
%s (%ld)!!\n", get_symbol().c_str(), total_write_back_cnt); ref_instance.add_read_prereq(prereqs); From 907a4cbcf8128b9b4e1b368577ff6c08369d4467 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:15 +0200 Subject: [PATCH 15/19] Avoid a GNU specific syntax by providing a ctor for metadata --- .../experimental/__stf/localization/composite_slice.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index c0115ffc275..3ec4f6a2408 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -36,6 +36,8 @@ class localized_array { struct metadata { + metadata(int dev_, size_t size_, size_t offset_) : alloc_handle{}, dev(dev_), size(size_), offset(offset_) {} + CUmemGenericAllocationHandle alloc_handle; int dev; size_t size; @@ -122,10 +124,7 @@ public: j++; } - meta.push_back({.alloc_handle = {}, - .dev = grid_pos_to_dev(p), - .size = j * alloc_granularity_bytes, - .offset = i * block_size_bytes}); + meta.emplace_back(grid_pos_to_dev(p), j * alloc_granularity_bytes, i * block_size_bytes); i += j; } From 255c8d4b06133b592f28a8f60bdfd357aaa4cb59 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:43 +0200 Subject: [PATCH 16/19] add missing override keyword and remove extra virtual --- .../cuda/experimental/__stf/places/exec/green_context.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index 29623bed7d9..9f9ab3bdb65 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -217,13 +217,13 @@ public: cuda_safe_call(cuCtxSetCurrent(saved_ctx)); } - virtual ::std::string to_string() const override + ::std::string to_string() const override { return "green ctx ( id=" + ::std::to_string(get_cuda_context_id(g_ctx)) + " dev_id =" + ::std::to_string(devid) + ")"; } - virtual stream_pool& get_stream_pool(async_resources_handle&, bool) const + stream_pool& get_stream_pool(async_resources_handle&, bool) const override { return *pool; } From f7608d0908a553975eb7751342f256b016c80869 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:01:58 +0200 Subject: [PATCH 17/19] default exec_place_grid ctor --- cudax/include/cuda/experimental/__stf/places/places.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index d2e146f101d..8b9b740f545 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -1025,6 +1025,9 @@ public: return ::std::static_pointer_cast(exec_place::get_impl()); } + // Default constructor + exec_place_grid() : exec_place(nullptr) {} + // private: exec_place_grid(::std::shared_ptr p) : exec_place(mv(p)) From b9a2617b18d7979ff1b463f133884cfd70bbe2a5 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:17:46 +0200 Subject: [PATCH 18/19] Remove useless const keywords --- .../cuda/experimental/__stf/places/blocked_partition.cuh | 2 +- cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh | 4 
++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh index 27a0e5e198b..8f1c042e885 100644 --- a/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh +++ b/cudax/include/cuda/experimental/__stf/places/blocked_partition.cuh @@ -64,7 +64,7 @@ public: } template - CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) { constexpr size_t dimensions = mdspan_shape_t::rank(); diff --git a/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh index 9e8e850d051..98134e69349 100644 --- a/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh +++ b/cudax/include/cuda/experimental/__stf/places/cyclic_shape.cuh @@ -202,7 +202,7 @@ public: cyclic_partition() = default; template - CUDASTF_HOST_DEVICE static const auto apply(const box& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const box& in, pos4 place_position, dim4 grid_dims) { ::std::array begins; ::std::array ends; @@ -218,7 +218,7 @@ public: } template - CUDASTF_HOST_DEVICE static const auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) + CUDASTF_HOST_DEVICE static auto apply(const mdspan_shape_t& in, pos4 place_position, dim4 grid_dims) { constexpr size_t dimensions = mdspan_shape_t::rank(); From 64f7b93cd28a396053794933858dcb7de1e0fa04 Mon Sep 17 00:00:00 2001 From: Cedric Augonnet Date: Fri, 11 Oct 2024 13:22:57 +0200 Subject: [PATCH 19/19] Revert change commited by mistake --- .devcontainer/cccl-entrypoint.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.devcontainer/cccl-entrypoint.sh b/.devcontainer/cccl-entrypoint.sh index de42a5d5666..79cd453c771 100755 --- a/.devcontainer/cccl-entrypoint.sh +++ b/.devcontainer/cccl-entrypoint.sh @@ -8,10 +8,7 @@ devcontainer-utils-post-create-command; devcontainer-utils-init-git; devcontainer-utils-post-attach-command; -# cd /home/coder/cccl/ -echo "CLANG .." -clang-format -i cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh -clang-format --version +cd /home/coder/cccl/ if test $# -gt 0; then exec "$@";