From e62b52fe781ba155f442dac4bf61144abfe7b522 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:55:25 +0100 Subject: [PATCH 01/26] add ani4 to symbolic cholesky test set --- test/factorization/cholesky_kernels.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ae2894d3696..6d352c135e7 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -88,6 +88,11 @@ class CholeskySymbolic : public CommonTestFixture { std::ifstream ani1_amd_stream{gko::matrices::location_ani1_amd_mtx}; matrices.emplace_back("ani1_amd", gko::read(ani1_amd_stream, ref)); + std::ifstream ani4_stream{gko::matrices::location_ani4_mtx}; + matrices.emplace_back("ani4", gko::read(ani4_stream, ref)); + std::ifstream ani4_amd_stream{gko::matrices::location_ani4_amd_mtx}; + matrices.emplace_back("ani4_amd", + gko::read(ani4_amd_stream, ref)); } void assert_equal_forests(elimination_forest& lhs, elimination_forest& rhs, From 016d0ea71f7509177033f0902d5baf2a5e5623d5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:56:35 +0100 Subject: [PATCH 02/26] fix sparsity pattern output --- core/test/utils/assertions.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 3dae62151b3..13e5dfe0ea5 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -108,6 +108,20 @@ class biggest_valuetype< }; +template +auto get_next_value_optional(NonzeroIterator& it, const NonzeroIterator& end, + size_type next_row, size_type next_col) + -> std::optionalvalue)>::type> +{ + if (it != end && it->row == next_row && it->column == next_col) { + return std::optional((it++)->value); + } else { + // monostate + return {}; + } +} + + template auto get_next_value(NonzeroIterator& it, const NonzeroIterator& end, size_type next_row, size_type next_col) -> @@ -190,11 +204,12 @@ void print_sparsity_pattern(Ostream& os, const MatrixData1& first, os << (row % 10); for (size_type col = 0; col < first.size[1]; col++) { const auto has_first = - get_next_value(first_it, end(first.nonzeros), row, col) != - zero(); + get_next_value_optional(first_it, end(first.nonzeros), row, col) + .has_value(); const auto has_second = - get_next_value(second_it, end(second.nonzeros), row, col) != - zero(); + get_next_value_optional(second_it, end(second.nonzeros), row, + col) + .has_value(); if (has_first) { if (has_second) { os << '+'; From 91f5b97181a3b268ee258c29b8a2979fc0c9b37d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:56:54 +0100 Subject: [PATCH 03/26] fix incorrect include --- common/cuda_hip/factorization/cholesky_kernels.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 7ff1382d8c6..d5badf6ef43 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -45,9 +45,6 @@ namespace cholesky { constexpr int default_block_size = 512; -#include "core/factorization/elimination_forest.hpp" - - namespace kernel { @@ -393,6 +390,15 @@ void factorize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const 
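
// --- Illustrative sketch (not part of the patch): the idea behind
// get_next_value_optional above. Returning a std::optional distinguishes
// "no entry stored at (row, col)" from "stored entry whose value happens to
// be zero", which the previous comparison against zero() could not. The
// nonzero struct and function names below are simplified stand-ins invented
// for illustration; they are not Ginkgo's matrix_data interface.
#include <iostream>
#include <optional>
#include <vector>

struct nonzero {  // simplified stand-in for a stored matrix entry
    int row;
    int column;
    double value;
};

// advances `it` past the entry at (row, col) if one is stored, and reports
// whether such an entry exists - independently of its numerical value
std::optional<double> next_value_optional(
    std::vector<nonzero>::const_iterator& it,
    std::vector<nonzero>::const_iterator end, int row, int col)
{
    if (it != end && it->row == row && it->column == col) {
        return (it++)->value;
    }
    return std::nullopt;
}

int main()
{
    // the explicit zero stored at (0, 1) is still part of the pattern
    const std::vector<nonzero> nonzeros{{0, 0, 1.0}, {0, 1, 0.0}, {1, 1, 2.0}};
    auto it = nonzeros.begin();
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            const auto entry =
                next_value_optional(it, nonzeros.end(), row, col);
            std::cout << (entry.has_value() ? '*' : ' ');
        }
        std::cout << '\n';
    }
}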
IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, From 50924cafe42aa6f5f13fae1382c97359e14b7942 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 16:04:10 +0100 Subject: [PATCH 04/26] add Cholesky skeleton tree computation kernel --- core/device_hooks/common_kernels.inc.cpp | 1 + core/factorization/cholesky_kernels.hpp | 9 ++++ dpcpp/factorization/cholesky_kernels.dp.cpp | 9 ++++ omp/factorization/cholesky_kernels.cpp | 48 +++++++++++++++++++ reference/factorization/cholesky_kernels.cpp | 47 ++++++++++++++++++ .../test/factorization/cholesky_kernels.cpp | 31 ++++++++++++ test/factorization/cholesky_kernels.cpp | 24 ++++++++++ 7 files changed, 169 insertions(+) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 034bc73e388..668a4e0f6bf 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -927,6 +927,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); namespace cholesky { +GKO_STUB_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 8230fa4552d..9a9a165e6b4 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -20,6 +20,13 @@ namespace gko { namespace kernels { +#define GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + #define GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType) \ void symbolic_count( \ std::shared_ptr exec, \ @@ -66,6 +73,8 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType); \ template \ diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 5cd8396be17..1461f918be7 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -25,6 +25,15 @@ namespace dpcpp { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index aa4aabfc731..293bb70ff8b 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -9,6 +9,8 @@ #include +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -28,6 +30,52 @@ 
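
// --- Illustrative sketch (not part of the patch): the declaration / stub /
// implementation pattern the new compute_skeleton_tree kernel follows above,
// where it is declared once via a macro in cholesky_kernels.hpp, registered
// as a not-implemented stub for backends that lack it (common_kernels.inc.cpp,
// dpcpp), and given a real definition per backend (omp, reference). The macro
// and namespace names below are invented and the signature is abbreviated;
// they are not Ginkgo's actual macros.
#include <iostream>
#include <stdexcept>

#define DECLARE_COMPUTE_SKELETON_TREE(IndexType)          \
    void compute_skeleton_tree(const IndexType* row_ptrs, \
                               IndexType size, IndexType* out_row_ptrs)

#define STUB(declaration) \
    declaration { throw std::runtime_error("not implemented"); }

namespace stub_backend {
// backend without an implementation yet: compiles and links, throws if used
STUB(DECLARE_COMPUTE_SKELETON_TREE(int))
}  // namespace stub_backend

namespace reference_backend {
// backend with a real definition
DECLARE_COMPUTE_SKELETON_TREE(int)
{
    // the actual Kruskal-based algorithm from the patch would go here;
    // as a placeholder, emit an empty output pattern
    for (int i = 0; i <= size; i++) {
        out_row_ptrs[i] = 0;
    }
    (void)row_ptrs;
}
}  // namespace reference_backend

int main()
{
    int row_ptrs[] = {0, 0};
    int out_row_ptrs[2] = {};
    reference_backend::compute_skeleton_tree(row_ptrs, 1, out_row_ptrs);
    try {
        stub_backend::compute_skeleton_tree(row_ptrs, 1, out_row_ptrs);
    } catch (const std::exception& e) {
        std::cout << e.what() << '\n';  // prints "not implemented"
    }
}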
namespace omp { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e4d7112a15f..e67778c1797 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -11,6 +11,7 @@ #include #include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -31,6 +32,52 @@ namespace reference { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + 
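
// --- Illustrative sketch (not part of the patch): the Kruskal construction
// used by compute_skeleton_tree above, written against a plain edge list and
// a minimal array-based union-find instead of Ginkgo's disjoint_sets helper.
// Edges are sorted ascending by their (max, min) vertex pair, so the first
// edge connecting two components is the one with the smallest "weight", and
// at most size - 1 edges survive. The patch then compresses the sorted output
// rows into CSR row pointers via convert_idxs_to_ptrs, which this sketch
// leaves out; all names here are simplified for illustration.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

int find(std::vector<int>& parent, int node)
{
    // walk up until we hit a root (a node that is its own parent)
    while (parent[node] != node) {
        parent[node] = parent[parent[node]];  // path halving
        node = parent[node];
    }
    return node;
}

// returns the tree edges as (row, col) pairs with row > col
std::vector<std::pair<int, int>> kruskal_skeleton(
    std::vector<std::pair<int, int>> edges, int size)
{
    std::sort(edges.begin(), edges.end());
    std::vector<int> parent(size);
    std::iota(parent.begin(), parent.end(), 0);
    std::vector<std::pair<int, int>> tree;
    for (const auto& edge : edges) {
        const auto first_rep = find(parent, edge.first);
        const auto second_rep = find(parent, edge.second);
        if (first_rep != second_rep) {
            tree.push_back(edge);
            parent[second_rep] = first_rep;  // join the two components
        }
    }
    return tree;
}

int main()
{
    // lower-triangular edges (row > col) of a small symmetric pattern
    const std::vector<std::pair<int, int>> edges{{2, 0}, {3, 1}, {3, 2}, {2, 1}};
    for (const auto& edge : kruskal_skeleton(edges, 4)) {
        std::cout << edge.first << " -> " << edge.second << '\n';
    }
}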
sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 630aa2d0406..cf72c13a0b1 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -233,6 +233,37 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) +{ + using matrix_type = typename TestFixture::matrix_type; + using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; + using elimination_forest = typename TestFixture::elimination_forest; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + this->forall_matrices( + [this] { + auto skeleton = matrix_type::create( + this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); + std::unique_ptr skeleton_forest; + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + + gko::kernels::reference::cholesky::compute_skeleton_tree( + this->ref, this->mtx->get_const_row_ptrs(), + this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], + skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elim_forest(skeleton.get(), + skeleton_forest); + + this->assert_equal_forests(*skeleton_forest, *this->forest); + }, + true); +} + + TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 6d352c135e7..d408bc930f8 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -130,6 +130,30 @@ using Types = gko::test::cartesian_type_product_tmatrices) { + SCOPED_TRACE(pair.first); + const auto& mtx = pair.second; + const auto dmtx = gko::clone(this->exec, mtx); + const auto size = mtx->get_size(); + auto skeleton = matrix_type::create(this->ref, size, size[0]); + auto dskeleton = matrix_type::create(this->exec, size, size[0]); + + gko::kernels::reference::cholesky::compute_skeleton_tree( + this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::compute_skeleton_tree( + this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); + + GKO_ASSERT_MTX_EQ_SPARSITY(skeleton, dskeleton); + } +} + + TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; From 0bf134dbf450c98224483a9605cb5e3752260d3a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 5 Dec 2024 21:45:03 +0100 Subject: [PATCH 05/26] add more precise atomic operations --- .../cuda_hip/components/memory.nvidia.hpp.inc | 1680 +++++++++++++++-- dev_tools/scripts/generate_cuda_memory_ptx.py | 50 +- 2 files changed, 1580 insertions(+), 150 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc 
b/common/cuda_hip/components/memory.nvidia.hpp.inc index f759c613f45..cf8d80c64e9 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -96,6 +96,63 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) } +__device__ __forceinline__ int32 atomic_add_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -127,6 +184,63 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) } +__device__ __forceinline__ int64 atomic_add_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -158,6 +272,25 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) } +__device__ __forceinline__ float atomic_add_relaxed_shared(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f32 %0, [%1], %2;" +#else + 
"atom.relaxed.cta.shared.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "f"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; @@ -189,6 +322,277 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) } +__device__ __forceinline__ double atomic_add_relaxed_shared(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_shared(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.relaxed.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.relaxed.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : 
"r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.relaxed.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.relaxed.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; @@ -317,76 +721,968 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) } -__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +__device__ __forceinline__ uint32 load_acquire_shared(const uint32* ptr) { - int32 result; + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.acquire.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return 
result; +} + + +__device__ __forceinline__ void store_release_shared(uint32* ptr, uint32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.release.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.acquire.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint64* ptr, uint64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.release.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.relaxed.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 atomic_add_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" 
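
// --- Illustrative sketch (not part of the patch): a CPU-side analogue, using
// std::atomic, of what the *_relaxed and load_acquire* / store_release*
// wrappers in this file provide on the GPU. A release store makes preceding
// plain writes visible to any thread whose acquire load observes the flag;
// relaxed accesses give atomicity without that ordering. The shared/local
// variants and the cta/gpu scope suffixes of the PTX versions have no direct
// std::atomic counterpart here.
#include <atomic>
#include <cassert>
#include <thread>

int data = 0;
std::atomic<int> flag{0};

void producer()
{
    data = 42;  // plain, non-atomic write
    // release: the write to data above becomes visible to any thread that
    // observes this store with an acquire load
    flag.store(1, std::memory_order_release);
}

void consumer()
{
    // acquire: once flag == 1 is seen, the write to data is guaranteed
    // visible; with relaxed ordering on both sides the assert could fire
    while (flag.load(std::memory_order_acquire) != 1) {
    }
    assert(data == 42);
}

int main()
{
    std::thread t1(producer), t2(consumer);
    t1.join();
    t2.join();
}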
+#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ float load_relaxed_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.relaxed.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.relaxed.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ float atomic_add_relaxed_local(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.relaxed.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.relaxed.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ double atomic_add_relaxed_local(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.relaxed.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.relaxed.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u32 %0, [%1], %2;" +#endif + : 
"=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.relaxed.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.relaxed.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u64 %0, [%1], %2;" +#else + 
"atom.relaxed.cta.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.acquire.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.release.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.acquire.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.release.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.acquire.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(float* ptr, float result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.release.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.acquire.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(double* ptr, double result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.release.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint32* ptr, uint32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_local(const uint64* 
ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint64* ptr, uint64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.gpu.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.gpu.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +{ + int64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.s64 %0, [%1];" #else - "ld.relaxed.cta.s32 %0, [%1];" + "ld.relaxed.gpu.s64 %0, [%1];" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); return result; } -__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.s64 [%0], %1;" #else - "st.relaxed.cta.s32 [%0], %1;" + "st.relaxed.gpu.s64 [%0], %1;" #endif ::"l"(ptr), - "r"(result) + "l"(result) : "memory"); } -__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +__device__ __forceinline__ int64 atomic_add_relaxed(int64* ptr, int64 value) { int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.cta.s64 %0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); + return result; +} + +__device__ __forceinline__ int64 atomic_min_relaxed(int64* ptr, int64 value) +{ 
+ int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); return result; } -__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +__device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s64 %0, [%1], %2;" #else - "st.relaxed.cta.s64 [%0], %1;" + "atom.relaxed.gpu.max.s64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed_local(const float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f32 %0, [%1];" #else - "ld.relaxed.cta.f32 %0, [%1];" + "ld.relaxed.gpu.f32 %0, [%1];" #endif : "=f"(result) : "l"(const_cast(ptr)) @@ -396,13 +1692,13 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr) } -__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +__device__ __forceinline__ void store_relaxed(float* ptr, float result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f32 [%0], %1;" #else - "st.relaxed.cta.f32 [%0], %1;" + "st.relaxed.gpu.f32 [%0], %1;" #endif ::"l"(ptr), "f"(result) @@ -410,14 +1706,32 @@ __device__ __forceinline__ void store_relaxed_local(float* ptr, float result) } -__device__ __forceinline__ double load_relaxed_local(const double* ptr) +__device__ __forceinline__ float atomic_add_relaxed(float* ptr, float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f64 %0, [%1];" #else - "ld.relaxed.cta.f64 %0, [%1];" + "ld.relaxed.gpu.f64 %0, [%1];" #endif : "=d"(result) : "l"(const_cast(ptr)) @@ -427,13 +1741,13 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr) } -__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +__device__ __forceinline__ void store_relaxed(double* ptr, double result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f64 [%0], %1;" #else - "st.relaxed.cta.f64 [%0], %1;" + "st.relaxed.gpu.f64 [%0], %1;" #endif ::"l"(ptr), "d"(result) @@ -441,255 +1755,263 @@ __device__ __forceinline__ void store_relaxed_local(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed(double* ptr, double value) { - int32 result; + double result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f64 %0, [%1], %2;" #else - "ld.acquire.cta.s32 %0, [%1];" + "atom.relaxed.gpu.add.f64 %0, [%1], %2;" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=d"(result) + : "l"(ptr), "d"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ 
__forceinline__ void store_release_local(int32* ptr, int32 result) +__device__ __forceinline__ uint32 load_relaxed(const uint32* ptr) { - membar_acq_rel_local(); + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "ld.volatile.u32 %0, [%1];" #else - "st.release.cta.s32 [%0], %1;" + "ld.relaxed.gpu.u32 %0, [%1];" #endif - ::"l"(ptr), - "r"(result) + : "=r"(result) + : "l"(const_cast(ptr)) : "memory"); -} - -__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) -{ - int64 result; - asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" -#else - "ld.acquire.cta.s64 %0, [%1];" -#endif - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +__device__ __forceinline__ void store_relaxed(uint32* ptr, uint32 result) { - membar_acq_rel_local(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" + "st.volatile.u32 [%0], %1;" #else - "st.release.cta.s64 [%0], %1;" + "st.relaxed.gpu.u32 [%0], %1;" #endif ::"l"(ptr), - "l"(result) + "r"(result) : "memory"); } -__device__ __forceinline__ float load_acquire_local(const float* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed(uint32* ptr, uint32 value) { - float result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f32 %0, [%1];" + "atom.relaxed.gpu.add.u32 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(float* ptr, float result) +__device__ __forceinline__ uint32 atomic_min_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u32 %0, [%1], %2;" #else - "st.release.cta.f32 [%0], %1;" + "atom.relaxed.gpu.min.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_local(const double* ptr) +__device__ __forceinline__ uint32 atomic_max_relaxed(uint32* ptr, uint32 value) { - double result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f64 %0, [%1];" + "atom.relaxed.gpu.max.u32 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(double* ptr, double result) +__device__ __forceinline__ uint32 atomic_and_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u32 %0, [%1], %2;" #else - "st.release.cta.f64 [%0], %1;" + "atom.relaxed.gpu.and.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "d"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } 
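
// --- Illustrative sketch (not part of the patch): what atomic_min_relaxed
// provides, written portably with std::atomic. PTX offers a single atom.min
// instruction for this; without such an instruction one falls back to a
// compare-exchange loop as below. The early "is the update even needed?"
// check is the same trick as the guarded_atomic_min helper used by the MST
// kernels later in this series.
#include <atomic>
#include <iostream>

int atomic_min_relaxed(std::atomic<int>& obj, int value)
{
    auto old = obj.load(std::memory_order_relaxed);
    // nothing to do if the stored value is already <= value
    while (old > value && !obj.compare_exchange_weak(
                              old, value, std::memory_order_relaxed)) {
        // compare_exchange_weak reloads `old` on failure, so we just retry
    }
    return old;  // value observed before the update, like the PTX wrapper
}

int main()
{
    std::atomic<int> min_edge{100};
    atomic_min_relaxed(min_edge, 42);
    atomic_min_relaxed(min_edge, 77);      // no effect, 42 is already smaller
    std::cout << min_edge.load() << '\n';  // prints 42
}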
-__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +__device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) { - int32 result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u32 %0, [%1], %2;" #else - "ld.relaxed.gpu.s32 %0, [%1];" + "atom.relaxed.gpu.or.u32 %0, [%1], %2;" #endif : "=r"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "r"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "ld.volatile.u64 %0, [%1];" #else - "st.relaxed.gpu.s32 [%0], %1;" + "ld.relaxed.gpu.u64 %0, [%1];" #endif - ::"l"(ptr), - "r"(result) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); + + return result; } -__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +__device__ __forceinline__ void store_relaxed(uint64* ptr, uint64 result) { - int64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" + "st.volatile.u64 [%0], %1;" #else - "ld.relaxed.gpu.s64 %0, [%1];" + "st.relaxed.gpu.u64 [%0], %1;" #endif - : "=l"(result) - : "l"(const_cast(ptr)) + ::"l"(ptr), + "l"(result) : "memory"); - - return result; } -__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +__device__ __forceinline__ uint64 atomic_add_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.s64 [%0], %1;" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed(const float* ptr) +__device__ __forceinline__ uint64 atomic_min_relaxed(uint64* ptr, uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f32 %0, [%1];" + "atom.relaxed.gpu.min.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_max_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f32 [%0], %1;" + "atom.relaxed.gpu.max.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_relaxed(const double* ptr) +__device__ __forceinline__ uint64 atomic_and_relaxed(uint64* ptr, uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f64 %0, [%1];" + "atom.relaxed.gpu.and.u64 %0, [%1], 
%2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f64 [%0], %1;" + "atom.relaxed.gpu.or.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "d"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } @@ -821,6 +2143,70 @@ __device__ __forceinline__ void store_release(double* ptr, double result) } +__device__ __forceinline__ uint32 load_acquire(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.gpu.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint32* ptr, uint32 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.gpu.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.gpu.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint64* ptr, uint64 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.gpu.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + __device__ __forceinline__ thrust::complex load_relaxed_shared( const thrust::complex* ptr) { diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 834c49dba46..ed6a6cb32cf 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -17,6 +17,8 @@ class ordering: fn_load_suffix: str ptx_store_suffix: str fn_store_suffix: str + ptx_loadstore_suffix: str + fn_loadstore_suffix: str is_relaxed: bool @@ -27,6 +29,13 @@ class type_desc: name: str +@dataclasses.dataclass +class operation: + fn_op_suffix: str + ptx_op_suffix: str + supports_float: bool + supports_signed: bool + memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), @@ -35,14 +44,23 @@ class type_desc: space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True), ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", - ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) + ptx_store_suffix=".release", fn_store_suffix="_release", + ptx_loadstore_suffix=".acq_rel", fn_loadstore_suffix="_acqrel", is_relaxed=False) ] types = 
[type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), - type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double"), + type_desc(ptx_type_suffix=".u32", val_constraint="r", name="uint32"), + type_desc(ptx_type_suffix=".u64", val_constraint="l", name="uint64")] +operations = [operation(fn_op_suffix="_add", ptx_op_suffix=".add", supports_float=True, supports_signed=True), + operation(fn_op_suffix="_min", ptx_op_suffix=".min", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_max", ptx_op_suffix=".max", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), + operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors // @@ -149,6 +167,32 @@ class type_desc: :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); }} +""") + for op in operations: + + if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): + # for some reason there is no signed 64 bit atomic support, + # but since the operations are equivalent in two's complement, we can use unsigned instead + new_type = ".u64" if t.ptx_type_suffix == ".s64" and op.fn_op_suffix == "_add" else t.ptx_type_suffix + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed: + print(f""" +__device__ __forceinline__ {t.name} atomic{op.fn_op_suffix}{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} value) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(value) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex From 9caaa2d0b4eace5e302091d110d16a5ff5aa1048 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 19:47:58 +0100 Subject: [PATCH 06/26] fix matrix symmetry --- test/factorization/cholesky_kernels.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index d408bc930f8..24cb4ae9b6a 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -45,14 +45,17 @@ class CholeskySymbolic : public CommonTestFixture { matrices.emplace_back( "example small", gko::initialize( - {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 0, 0, 1}}, ref)); + {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); + // this is the example from Liu 1990 https://doi.org/10.1137/0611010. 
+ // "The Role of Elimination Trees in Sparse Factorization" + matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, + {1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, @@ -65,7 +68,7 @@ class CholeskySymbolic : public CommonTestFixture { {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, From f2d970c4750ad6eb08c73a8fd11a77fd9055fc18 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 20:08:29 +0100 Subject: [PATCH 07/26] fix reference MST algorithm --- reference/factorization/cholesky_kernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e67778c1797..94d8903ca07 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -40,7 +40,8 @@ void compute_skeleton_tree(std::shared_ptr exec, { disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); + vector> edges(exec); + edges.reserve(nnz); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { From 0af05e2adb63d3ebf443f36a944152d636d61ec3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 20:24:50 +0100 Subject: [PATCH 08/26] add GPU MST algorithm for Cholesky preprocessing --- .../factorization/cholesky_kernels.cpp | 271 +++++++++++++++++- test/factorization/cholesky_kernels.cpp | 49 +++- 2 files changed, 308 insertions(+), 12 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index d5badf6ef43..61a1012e435 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -5,6 +5,7 @@ #include "core/factorization/cholesky_kernels.hpp" #include +#include #include #include @@ -21,6 +22,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" @@ -48,6 +50,163 @@ constexpr int default_block_size = 512; namespace kernel { +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = row; + 
worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomic_min_relaxed(ptr, value); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + using atomic_type = + std::conditional_t, int32, + unsigned long long>; + auto old_parent = + atomicCAS(reinterpret_cast(parents + old_rep), + static_cast(old_rep), + static_cast(new_rep)); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = atomic_add_relaxed(out_counter, 1); + out_sources[out_i] = edge_sources[edge_id]; + 
out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + template __global__ __launch_bounds__(default_block_size) void build_postorder_cols( IndexType num_rows, const IndexType* cols, const IndexType* row_ptrs, @@ -230,6 +389,109 @@ __global__ __launch_bounds__(default_block_size) void factorize( } // namespace kernel +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 8)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + 
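The worklist loop that follows implements Boruvka's minimum spanning tree algorithm: every round finds, for each component, the smallest incident edge leaving the component (mst_find_minimum) and then merges the two endpoint components along those minimal edges (mst_join_edges), until no cross-component edge remains. The following sequential sketch shows the same two-phase iteration; it is illustrative only, uses plain std containers instead of Ginkgo types, and treats the edge id as the weight, which matches the kernels since the edges are generated in row order:

#include <algorithm>
#include <limits>
#include <numeric>
#include <tuple>
#include <vector>

// Sequential Boruvka sketch (illustrative): edges are (source, target, id)
// tuples and the id doubles as the edge weight. Returns the ids of the
// edges selected for the minimum spanning forest.
std::vector<int> boruvka_sketch(int num_nodes,
                                std::vector<std::tuple<int, int, int>> worklist)
{
    std::vector<int> parent(num_nodes);
    std::iota(parent.begin(), parent.end(), 0);
    // representative lookup by pointer chasing, like mst_find
    auto find = [&](int v) {
        while (parent[v] != v) {
            v = parent[v];
        }
        return v;
    };
    std::vector<int> tree_edge_ids;
    while (!worklist.empty()) {
        // phase 1: keep cross-component edges and record, per component,
        // the smallest incident edge id (cf. mst_find_minimum)
        std::vector<int> min_edge(num_nodes, std::numeric_limits<int>::max());
        std::vector<std::tuple<int, int, int>> next;
        for (const auto& [src, tgt, id] : worklist) {
            const auto src_rep = find(src);
            const auto tgt_rep = find(tgt);
            if (src_rep != tgt_rep) {
                next.emplace_back(src_rep, tgt_rep, id);
                min_edge[src_rep] = std::min(min_edge[src_rep], id);
                min_edge[tgt_rep] = std::min(min_edge[tgt_rep], id);
            }
        }
        // phase 2: join components along their minimal edges
        // (cf. mst_join_edges)
        for (const auto& [src_rep, tgt_rep, id] : next) {
            if (min_edge[src_rep] == id || min_edge[tgt_rep] == id) {
                const auto a = find(src_rep);
                const auto b = find(tgt_rep);
                if (a != b) {
                    parent[std::max(a, b)] = std::min(a, b);
                    tree_edge_ids.push_back(id);
                }
            }
        }
        worklist = std::move(next);
    }
    return tree_edge_ids;
}

Since the edge ids are distinct, the minimal edges selected in one round cannot form a cycle, so every round either empties the worklist or merges at least two components; the CAS loop in mst_join_edges only has to resolve the case where several threads merge into the same component concurrently.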
components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + } + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_factorize( std::shared_ptr exec, @@ -390,15 +652,6 @@ void factorize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 24cb4ae9b6a..a08b71d0959 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -14,6 +14,7 @@ #include #include +#include "core/components/disjoint_sets.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" @@ -111,6 +112,27 @@ class CholeskySymbolic : public CommonTestFixture { } } + index_type check_mst(const std::unique_ptr& mst) const + { + gko::matrix_data mst_data; + const auto size = mst->get_size(); + mst->write(mst_data); + gko::disjoint_sets sets(this->ref, size[0]); + index_type weight_sum{}; + // need an IIFE because the assertions would return something + [&] { + for (const auto entry : mst_data.nonzeros) { + ASSERT_GT(entry.row, entry.column); + weight_sum += entry.row; + const auto row_rep = sets.find(entry.row); + const auto col_rep = sets.find(entry.column); + ASSERT_NE(row_rep, col_rep); + sets.join(row_rep, col_rep); + } + }(); + return weight_sum; + } + std::vector>> matrices; gko::array tmp; @@ -136,14 +158,18 @@ TYPED_TEST_SUITE(CholeskySymbolic, Types, PairTypenameNameGenerator); TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) { using matrix_type = typename 
TestFixture::matrix_type; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; + using elimination_forest = typename TestFixture::elimination_forest; for (const auto& pair : this->matrices) { SCOPED_TRACE(pair.first); const auto& mtx = pair.second; + // check for correctness: is mtx symmetric? + GKO_ASSERT_MTX_EQ_SPARSITY(mtx, gko::as(mtx->transpose())); const auto dmtx = gko::clone(this->exec, mtx); const auto size = mtx->get_size(); - auto skeleton = matrix_type::create(this->ref, size, size[0]); - auto dskeleton = matrix_type::create(this->exec, size, size[0]); + const auto skeleton = matrix_type::create(this->ref, size, size[0]); + const auto dskeleton = matrix_type::create(this->exec, size, size[0]); gko::kernels::reference::cholesky::compute_skeleton_tree( this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), @@ -152,7 +178,24 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); - GKO_ASSERT_MTX_EQ_SPARSITY(skeleton, dskeleton); + // check that the created graphs are trees and have the same edge sum + const auto weight_sum = this->check_mst(skeleton); + const auto dweight_sum = this->check_mst(dskeleton); + ASSERT_EQ(weight_sum, dweight_sum); + // while the MSTs may be different, they should produce the same + // elimination forest as the original matrix + std::unique_ptr forest; + std::unique_ptr skeleton_forest; + std::unique_ptr dskeleton_forest; + gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elim_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elim_forest(dskeleton.get(), + dskeleton_forest); + // the parents array fully determines the elimination forest + GKO_ASSERT_ARRAY_EQ(forest->parents, skeleton_forest->parents); + GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, + dskeleton_forest->parents); } } From 7158492325d58697df13b7e3c81877396cc6b1e4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 21:12:35 +0100 Subject: [PATCH 09/26] add AMD support --- .../factorization/cholesky_kernels.cpp | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 61a1012e435..54fa3c8caa9 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -58,6 +58,8 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -65,7 +67,8 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( const auto row = rows[i]; const auto col = cols[i]; if (col < row) { - const auto out_i = atomic_add_relaxed(worklist_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); worklist_sources[out_i] = row; worklist_targets[out_i] = col; worklist_edge_ids[out_i] = i; @@ -100,9 +103,12 @@ __device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) template __device__ void guarded_atomic_min(IndexType* ptr, IndexType value) { + using atomic_type = std::conditional_t, + int32, unsigned 
long long>; // only execute the atomic if we know that it might have an effect if (load_relaxed_local(ptr) > value) { - atomic_min_relaxed(ptr, value); + atomicMin(reinterpret_cast(ptr), + static_cast(value)); } } @@ -118,6 +124,8 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -128,7 +136,8 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( const auto source_rep = mst_find(parents, source); const auto target_rep = mst_find(parents, target); if (source_rep != target_rep) { - const auto out_i = atomic_add_relaxed(worklist_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); worklist_sources[out_i] = source_rep; worklist_targets[out_i] = target_rep; worklist_edge_ids[out_i] = edge_id; @@ -149,6 +158,8 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, IndexType* __restrict__ out_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -166,9 +177,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( bool repeat = false; do { repeat = false; - using atomic_type = - std::conditional_t, int32, - unsigned long long>; auto old_parent = atomicCAS(reinterpret_cast(parents + old_rep), static_cast(old_rep), @@ -180,7 +188,8 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( repeat = true; } } while (repeat); - const auto out_i = atomic_add_relaxed(out_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(out_counter), atomic_type{1})); out_sources[out_i] = edge_sources[edge_id]; out_targets[out_i] = edge_targets[edge_id]; } From 7d96d1d7dd6fb56dcb63679c1a72b5ce18506c7a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 21:39:42 +0100 Subject: [PATCH 10/26] add benchmark for MST-enhanced symbolic Cholesky --- benchmark/sparse_blas/operations.cpp | 31 +++++++++++++---- core/factorization/symbolic.cpp | 51 ++++++++++++++++++++++++++++ core/factorization/symbolic.hpp | 16 +++++++++ 3 files changed, 92 insertions(+), 6 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 61eeaeeb627..1b4e5819793 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -614,8 +614,9 @@ class SymbolicLuNearSymmOperation : public BenchmarkOperation { class SymbolicCholeskyOperation : public BenchmarkOperation { public: - explicit SymbolicCholeskyOperation(const Mtx* mtx, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, result_{} + explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, + bool symmetric) + : mtx_{mtx}, symmetric_{symmetric}, device_{device}, result_{} {} std::pair validate() const override @@ -643,8 +644,13 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { void run() override { - gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, - forest_); + if (device_) { + gko::factorization::symbolic_cholesky_device(mtx_, symmetric_, + result_, forest_); + } else { + gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, + forest_); + } } void write_stats(json& object) 
override @@ -655,6 +661,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; bool symmetric_; + bool device_; std::unique_ptr result_; std::unique_ptr> forest_; }; @@ -791,13 +798,25 @@ const std::map(mtx); }}, + {"symbolic_cholesky_device", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + false); + }}, + {"symbolic_cholesky_device_symmetric", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + true); + }}, {"symbolic_cholesky", [](const Mtx* mtx) { - return std::make_unique(mtx, false); + return std::make_unique(mtx, false, + false); }}, {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { - return std::make_unique(mtx, true); + return std::make_unique(mtx, false, + true); }}, {"reorder_rcm", [](const Mtx* mtx) { diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index b08602e1a1a..3d59ec78ac0 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -24,6 +24,7 @@ namespace factorization { namespace { +GKO_REGISTER_OPERATION(compute_skeleton_tree, cholesky::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, @@ -80,6 +81,56 @@ void symbolic_cholesky( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY); +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest) +{ + using matrix_type = matrix::Csr; + GKO_ASSERT_IS_SQUARE_MATRIX(mtx); + const auto exec = mtx->get_executor(); + const auto num_rows = mtx->get_size()[0]; + const auto host_exec = exec->get_master(); + { + const auto skeleton = + matrix_type::create(exec, mtx->get_size(), num_rows); + exec->run(make_compute_skeleton_tree( + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, + skeleton->get_row_ptrs(), skeleton->get_col_idxs())); + exec->run(make_compute_elim_forest(skeleton.get(), forest)); + } + array row_ptrs{exec, num_rows + 1}; + array tmp{exec}; + exec->run(make_symbolic_count(mtx, *forest, row_ptrs.get_data(), tmp)); + exec->run(make_prefix_sum_nonnegative(row_ptrs.get_data(), num_rows + 1)); + const auto factor_nnz = + static_cast(get_element(row_ptrs, num_rows)); + factors = matrix_type::create( + exec, mtx->get_size(), array{exec, factor_nnz}, + array{exec, factor_nnz}, std::move(row_ptrs)); + exec->run(make_symbolic(mtx, *forest, factors.get(), tmp)); + factors->sort_by_column_index(); + if (symmetrize) { + auto lt_factor = as(factors->transpose()); + const auto scalar = + initialize>({one()}, exec); + const auto id = matrix::Identity::create(exec, num_rows); + lt_factor->apply(scalar, id, scalar, factors); + } +} + + +#define GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE(ValueType, IndexType) \ + void symbolic_cholesky_device( \ + const matrix::Csr* mtx, bool symmetrize, \ + std::unique_ptr>& factors, \ + std::unique_ptr>& forest) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE); + + template void symbolic_lu_near_symm( const matrix::Csr* mtx, diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 096d8c998bc..84478e214e6 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -25,6 +25,22 @@ void symbolic_cholesky( std::unique_ptr>& factors, std::unique_ptr>& forest); + +/** + * Computes the symbolic Cholesky factorization of the given matrix 
on the + * device. + * + * @param mtx the input matrix + * @param symmetrize output the pattern of L + L^T (true) or just L (false)? + * @param factors the output factor(s) + * @param forest the elimination forest of the input matrix + */ +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest); + /** * Computes the symbolic LU factorization of the given matrix. * From ef4b6f19d009288bc69059982246a72dd66c760c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 20 Dec 2024 20:24:00 +0100 Subject: [PATCH 11/26] extract elimination forest kernels into separate file --- common/cuda_hip/CMakeLists.txt | 1 + .../factorization/cholesky_kernels.cpp | 328 --------------- .../elimination_forest_kernels.cpp | 381 ++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 14 +- core/factorization/cholesky.cpp | 5 +- core/factorization/cholesky_kernels.hpp | 18 - .../elimination_forest_kernels.hpp | 75 ++++ core/factorization/ic.cpp | 6 +- core/factorization/symbolic.cpp | 4 +- dpcpp/CMakeLists.txt | 1 + dpcpp/factorization/cholesky_kernels.dp.cpp | 19 - .../elimination_forest_kernels.dp.cpp | 46 +++ omp/CMakeLists.txt | 1 + omp/factorization/cholesky_kernels.cpp | 89 ---- .../elimination_forest_kernels.cpp | 118 ++++++ reference/CMakeLists.txt | 1 + reference/factorization/cholesky_kernels.cpp | 85 ---- .../elimination_forest_kernels.cpp | 116 ++++++ .../test/factorization/cholesky_kernels.cpp | 5 +- test/factorization/cholesky_kernels.cpp | 13 +- 20 files changed, 772 insertions(+), 554 deletions(-) create mode 100644 common/cuda_hip/factorization/elimination_forest_kernels.cpp create mode 100644 core/factorization/elimination_forest_kernels.hpp create mode 100644 dpcpp/factorization/elimination_forest_kernels.dp.cpp create mode 100644 omp/factorization/elimination_forest_kernels.cpp create mode 100644 reference/factorization/elimination_forest_kernels.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 5cfa55ca687..d7d1de30dea 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -12,6 +12,7 @@ set(CUDA_HIP_SOURCES distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp factorization/factorization_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp factorization/lu_kernels.cpp diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 54fa3c8caa9..483144d8708 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -50,172 +50,6 @@ constexpr int default_block_size = 512; namespace kernel { -template -__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( - const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, - IndexType size, IndexType* __restrict__ worklist_sources, - IndexType* __restrict__ worklist_targets, - IndexType* __restrict__ worklist_edge_ids, - IndexType* __restrict__ worklist_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto row = rows[i]; - const auto col = cols[i]; - if (col < row) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); - worklist_sources[out_i] = row; - worklist_targets[out_i] = col; - 
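For orientation, the following sketch shows how the new device entry point is meant to be called, mirroring the benchmark operation above; the concrete types, includes and the example function name are illustrative, not part of the patch:

#include <memory>

#include <ginkgo/ginkgo.hpp>

#include "core/factorization/elimination_forest.hpp"
#include "core/factorization/symbolic.hpp"

// Illustrative usage sketch of symbolic_cholesky_device (assumed setup:
// a symmetric CSR matrix that already resides on the target executor).
using csr_type = gko::matrix::Csr<double, gko::int32>;

void run_symbolic_cholesky_device(const csr_type* mtx)
{
    std::unique_ptr<csr_type> factors;
    std::unique_ptr<gko::factorization::elimination_forest<gko::int32>> forest;
    // symmetrize = true requests the pattern of L + L^T, false only L
    gko::factorization::symbolic_cholesky_device(mtx, /* symmetrize */ true,
                                                 factors, forest);
    // factors now holds the symbolic factor pattern,
    // forest the elimination forest of mtx
}

Internally, symbolic_cholesky_device first computes a skeleton tree on the device and derives the elimination forest from that much sparser matrix, as shown in the core/factorization/symbolic.cpp hunk above.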
worklist_edge_ids[out_i] = i; - } -} - - -template -__device__ IndexType mst_find(const IndexType* parents, IndexType node) -{ - auto parent = parents[node]; - while (parent != node) { - node = parent; - parent = parents[node]; - }; - return parent; -} - - -template -__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) -{ - auto parent = load_relaxed_local(parents + node); - while (parent != node) { - node = parent; - parent = load_relaxed_local(parents + node); - }; - return parent; -} - - -template -__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - // only execute the atomic if we know that it might have an effect - if (load_relaxed_local(ptr) > value) { - atomicMin(reinterpret_cast(ptr), - static_cast(value)); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_find_minimum( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, - const IndexType* __restrict__ in_edge_ids, IndexType size, - const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, - IndexType* __restrict__ worklist_sources, - IndexType* __restrict__ worklist_targets, - IndexType* __restrict__ worklist_edge_ids, - IndexType* __restrict__ worklist_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - const auto edge_id = in_edge_ids[i]; - const auto source_rep = mst_find(parents, source); - const auto target_rep = mst_find(parents, target); - if (source_rep != target_rep) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); - worklist_sources[out_i] = source_rep; - worklist_targets[out_i] = target_rep; - worklist_edge_ids[out_i] = edge_id; - guarded_atomic_min(min_edge + source_rep, edge_id); - guarded_atomic_min(min_edge + target_rep, edge_id); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_join_edges( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, - const IndexType* __restrict__ in_edge_ids, IndexType size, - IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, - const IndexType* __restrict__ edge_sources, - const IndexType* __restrict__ edge_targets, - IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, - IndexType* __restrict__ out_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - const auto edge_id = in_edge_ids[i]; - if (min_edge[source] == edge_id || min_edge[target] == edge_id) { - // join source and sink - const auto source_rep = mst_find_relaxed(parents, source); - const auto target_rep = mst_find_relaxed(parents, target); - assert(source_rep != target_rep); - const auto new_rep = min(source_rep, target_rep); - auto old_rep = max(source_rep, target_rep); - bool repeat = false; - do { - repeat = false; - auto old_parent = - atomicCAS(reinterpret_cast(parents + old_rep), - static_cast(old_rep), - static_cast(new_rep)); - // if this fails, the parent of old_rep changed recently, so we need - // to try again by updating the parent's parent (hopefully its rep) - if (old_parent != old_rep) { - 
old_rep = old_parent; - repeat = true; - } - } while (repeat); - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(out_counter), atomic_type{1})); - out_sources[out_i] = edge_sources[edge_id]; - out_targets[out_i] = edge_targets[edge_id]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, IndexType size, - IndexType* __restrict__ min_edge) -{ - constexpr auto sentinel = std::numeric_limits::max(); - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - // we could write the values non-atomically, but this makes race checkers - // happier without a performance penalty (hopefully, thanks to _local) - store_relaxed_local(min_edge + source, sentinel); - store_relaxed_local(min_edge + target, sentinel); -} - - template __global__ __launch_bounds__(default_block_size) void build_postorder_cols( IndexType num_rows, const IndexType* cols, const IndexType* row_ptrs, @@ -398,109 +232,6 @@ __global__ __launch_bounds__(default_block_size) void factorize( } // namespace kernel -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - const auto policy = thrust_policy(exec); - const auto nnz = exec->copy_val_to_host(row_ptrs + size); - // convert edges to COO representation - // the edge list is sorted, since we only consider edges where row > col, - // and the row array (= weights) is sorted coming from row_ptrs - array row_array{exec, static_cast(nnz)}; - const auto rows = row_array.get_data(); - components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); - // we assume the matrix is symmetric, so we can remove every second edge - // also round up the worklist size for equal cache alignment between fields - const auto worklist_size = - ceildiv(nnz, config::warp_size * 2) * config::warp_size; - // create 2 worklists consisting of (start, end, edge_id) - array worklist{exec, static_cast(worklist_size * 8)}; - auto wl1_source = worklist.get_data(); - auto wl1_target = wl1_source + worklist_size; - auto wl1_edge_id = wl1_target + worklist_size; - auto wl2_source = wl1_source + 3 * worklist_size; - auto wl2_target = wl1_target + 3 * worklist_size; - auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; - // atomic counters for worklists and output edge list - array counters{exec, 3}; - auto wl1_counter = counters.get_data(); - auto wl2_counter = wl1_counter + 1; - auto output_counter = wl2_counter + 1; - components::fill_array(exec, counters.get_data(), counters.get_size(), - IndexType{}); - // helpers for interacting with worklists - const auto clear_wl1 = [&] { - IndexType value{}; - exec->copy_from(exec->get_master(), 1, &value, wl1_counter); - }; - const auto clear_wl2 = [&] { - IndexType value{}; - exec->copy_from(exec->get_master(), 1, &value, wl2_counter); - }; - const auto get_wl1_size = [&] { - return exec->copy_val_to_host(wl1_counter); - }; - const auto swap_wl1_wl2 = [&] { - std::swap(wl1_source, wl2_source); - std::swap(wl1_target, wl2_target); - std::swap(wl1_edge_id, wl2_edge_id); - std::swap(wl1_counter, wl2_counter); - }; - // initialize every node to a singleton set - array parent_array{exec, size}; - const auto parents = parent_array.get_data(); - components::fill_seq_array(exec, parents, size); - // array storing the 
minimal edge adjacent to each node - array min_edge_array{exec, size}; - const auto min_edges = min_edge_array.get_data(); - constexpr auto min_edge_sentinel = std::numeric_limits::max(); - components::fill_array(exec, min_edges, size, min_edge_sentinel); - // output row array, to be used in conjunction with out_cols in COO storage - array out_row_array{exec, size}; - const auto out_rows = out_row_array.get_data(); - // initialize worklist1 with forward edges - { - const auto num_blocks = ceildiv(nnz, default_block_size); - kernel::mst_initialize_worklist<<>>( - rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); - } - auto wl1_size = get_wl1_size(); - while (wl1_size > 0) { - clear_wl2(); - // attach each node to its smallest adjacent non-cycle edge - { - const auto num_blocks = ceildiv(wl1_size, default_block_size); - kernel::mst_find_minimum<<>>( - wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, - min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); - } - clear_wl1(); - swap_wl1_wl2(); - wl1_size = get_wl1_size(); - if (wl1_size > 0) { - // join minimal edges - const auto num_blocks = ceildiv(wl1_size, default_block_size); - kernel::mst_join_edges<<>>( - wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, - min_edges, rows, cols, out_rows, out_cols, output_counter); - kernel::mst_reset_min_edges<<>>( - wl1_source, wl1_target, wl1_size, min_edges); - } - } - const auto num_mst_edges = exec->copy_val_to_host(output_counter); - thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); - thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, - out_cols); - components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_factorize( std::shared_ptr exec, @@ -534,65 +265,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void build_children_from_parents( - std::shared_ptr exec, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = forest.parents.get_size(); - // build COO representation of the tree - array col_idx_array{exec, num_rows}; - const auto col_idxs = col_idx_array.get_data(); - const auto parents = forest.parents.get_const_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - exec->copy(num_rows, parents, col_idxs); - thrust::sequence(thrust_policy(exec), children, children + num_rows, - IndexType{}); - // group by parent - thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, - col_idxs + num_rows, children); - // create child pointers for groups of children - components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, - num_rows + 1, // rows plus sentinel - child_ptrs); -} - - -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = factors->get_size()[0]; - const auto it = thrust::make_counting_iterator(IndexType{}); - thrust::transform( - thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), - [row_ptrs = factors->get_const_row_ptrs(), - col_idxs = factors->get_const_col_idxs(), - num_rows] __device__(IndexType l_col) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // 
parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - return l_row; - } - } - // sentinel pseudo-root - return static_cast(num_rows); - }); - build_children_from_parents(exec, forest); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..6291b7628bc --- /dev/null +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,381 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace elimination_forest { + + +constexpr int default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); + worklist_sources[out_i] = row; + worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomicMin(reinterpret_cast(ptr), + static_cast(value)); + } +} + + +template +__global__ 
__launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + auto old_parent = + atomicCAS(reinterpret_cast(parents + old_rep), + static_cast(old_rep), + static_cast(new_rep)); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(out_counter), atomic_type{1})); + out_sources[out_i] = edge_sources[edge_id]; + out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + 
store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + +} // namespace kernel + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 8)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // 
join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + } + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void build_children_from_parents( + std::shared_ptr exec, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = forest.parents.get_size(); + // build COO representation of the tree + array col_idx_array{exec, num_rows}; + const auto col_idxs = col_idx_array.get_data(); + const auto parents = forest.parents.get_const_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + exec->copy(num_rows, parents, col_idxs); + thrust::sequence(thrust_policy(exec), children, children + num_rows, + IndexType{}); + // group by parent + thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, + col_idxs + num_rows, children); + // create child pointers for groups of children + components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, + num_rows + 1, // rows plus sentinel + child_ptrs); +} + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = factors->get_size()[0]; + const auto it = thrust::make_counting_iterator(IndexType{}); + thrust::transform( + thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), + [row_ptrs = factors->get_const_row_ptrs(), + col_idxs = factors->get_const_col_idxs(), + num_rows] __device__(IndexType l_col) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + return l_row; + } + } + // sentinel pseudo-root + return static_cast(num_rows); + }); + build_children_from_parents(exec, forest); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 668a4e0f6bf..19c9917ed4b 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -23,6 +23,7 @@ #include "core/distributed/partition_kernels.hpp" #include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/factorization/ilu_kernels.hpp" @@ -927,10 +928,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); namespace cholesky { 
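To make the forest layout concrete: elimination_forest stores a parents array plus a CSR-like (child_ptrs, children) pair, with num_rows acting as a sentinel pseudo-root, and build_children_from_parents above fills the latter two from the former. A sequential analogue with a hand-computed example, illustrative only:

#include <algorithm>
#include <numeric>
#include <vector>

// Sequential analogue of build_children_from_parents (illustrative):
// group the node ids by their parent; num_rows acts as the sentinel
// pseudo-root, so child_ptrs has num_rows + 2 entries.
void build_children_sketch(const std::vector<int>& parents,
                           std::vector<int>& children,
                           std::vector<int>& child_ptrs)
{
    const auto num_rows = static_cast<int>(parents.size());
    children.resize(num_rows);
    std::iota(children.begin(), children.end(), 0);
    // stable sort of the node ids by parent, like the
    // thrust::stable_sort_by_key call above
    std::stable_sort(children.begin(), children.end(),
                     [&](int a, int b) { return parents[a] < parents[b]; });
    // counting-sort style pointer construction, like convert_idxs_to_ptrs
    child_ptrs.assign(num_rows + 2, 0);
    for (const auto parent : parents) {
        child_ptrs[parent + 1]++;
    }
    std::partial_sum(child_ptrs.begin(), child_ptrs.end(), child_ptrs.begin());
}

// Example: a 4x4 factor with strictly lower-triangular nonzeros at
// (2, 0), (2, 1) and (3, 2) yields parents = {2, 2, 3, 4} via
// parent[j] = min(i | i > j and l_ij != 0), and from that
// children = {0, 1, 2, 3} and child_ptrs = {0, 0, 0, 2, 3, 4}:
// the children of node p are children[child_ptrs[p]] up to, but excluding,
// children[child_ptrs[p + 1]], e.g. node 2 has the children {0, 1}.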
-GKO_STUB_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); @@ -938,6 +937,17 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky +namespace elimination_forest { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + +} // namespace elimination_forest + + namespace factorization { diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index ce189e3bbdc..2f8d8630a1e 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -15,6 +15,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -27,7 +28,7 @@ namespace { GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -102,7 +103,7 @@ std::unique_ptr Cholesky::generate_impl( forest = std::make_unique>( exec, num_rows); - exec->run(make_forest_from_factor(factors.get(), *forest)); + exec->run(make_from_factor(factors.get(), *forest)); } // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 9a9a165e6b4..f80e057da3a 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -20,13 +20,6 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType) \ - void compute_skeleton_tree(std::shared_ptr exec, \ - const IndexType* row_ptrs, \ - const IndexType* cols, size_type size, \ - IndexType* out_row_ptrs, IndexType* out_cols) - - #define GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType) \ void symbolic_count( \ std::shared_ptr exec, \ @@ -44,13 +37,6 @@ namespace kernels { const array& tmp_storage) -#define GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType) \ - void forest_from_factor( \ - std::shared_ptr exec, \ - const matrix::Csr* factors, \ - gko::factorization::elimination_forest& forest) - - #define GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType) \ void initialize(std::shared_ptr exec, \ const matrix::Csr* mtx, \ @@ -73,15 +59,11 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType); \ template \ - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType); \ 
- template \ GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_FACTORIZE(ValueType, IndexType) diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp new file mode 100644 index 00000000000..8d76ee73b54 --- /dev/null +++ b/core/factorization/elimination_forest_kernels.hpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ + + +#include "core/factorization/elimination_forest.hpp" + +#include + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + +#define GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) \ + void from_factor( \ + std::shared_ptr exec, \ + const matrix::Csr* factors, \ + gko::factorization::elimination_forest& forest) + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType) \ + void compute_subtree_sizes( \ + std::shared_ptr exec, \ + const gko::factorization::elimination_forest& forest, \ + IndexType* subtree_size) + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) \ + void compute_postorder( \ + std::shared_ptr exec, \ + const gko::factorization::elimination_forest& forest, \ + const IndexType* subtree_size, IndexType* postorder, \ + IndexType* inv_postorder) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 46d45ba5f04..3eb14e05a4a 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -18,6 +18,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -37,7 +38,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l, GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); // for gko syncfree implementation GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -119,8 +120,7 @@ std::unique_ptr> Ic::generate( forest = std::make_unique>( exec, num_rows); - exec->run( - 
ic_factorization::make_forest_from_factor(factors.get(), *forest)); + exec->run(ic_factorization::make_from_factor(factors.get(), *forest)); // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 3d59ec78ac0..9c7e1785396 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -15,6 +15,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -24,7 +25,8 @@ namespace factorization { namespace { -GKO_REGISTER_OPERATION(compute_skeleton_tree, cholesky::compute_skeleton_tree); +GKO_REGISTER_OPERATION(compute_skeleton_tree, + elimination_forest::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 81a2a6034ea..a5fc043e171 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(ginkgo_dpcpp distributed/partition_kernels.dp.cpp distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp + factorization/elimination_forest_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp factorization/factorization_kernels.dp.cpp diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 1461f918be7..50bdbceaa75 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -25,15 +25,6 @@ namespace dpcpp { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -151,16 +142,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor(std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& - forest) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp new file mode 100644 index 00000000000..4aecc0993f9 --- /dev/null +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include + +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace climination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + 
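// Illustration (assumption-laden sketch, not Ginkgo code): the skeleton tree
// declared above reduces a symmetric sparsity pattern to a spanning forest
// with at most n - 1 lower-triangular edges while preserving the elimination
// forest of the original matrix. The host kernels later in this series do
// this with Kruskal's algorithm over a disjoint-set structure; a minimal
// standalone version of that idea, using plain std:: containers and
// hypothetical names, could look like this:
#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

struct union_find {
    std::vector<int> parent;
    explicit union_find(int n) : parent(n)
    {
        std::iota(parent.begin(), parent.end(), 0);
    }
    int find(int i) { return parent[i] == i ? i : parent[i] = find(parent[i]); }
    bool join(int a, int b)
    {
        a = find(a);
        b = find(b);
        if (a == b) return false;
        parent[a] = b;
        return true;
    }
};

// Collect lower-triangular edges (row > col) of a CSR pattern and keep only
// those that connect two previously disconnected components. Visiting rows in
// ascending order corresponds to Kruskal's sort by edge weight, where the
// weight of an edge is its larger endpoint.
std::vector<std::pair<int, int>> skeleton_tree(const std::vector<int>& row_ptrs,
                                               const std::vector<int>& cols)
{
    const auto n = static_cast<int>(row_ptrs.size()) - 1;
    union_find sets(n);
    std::vector<std::pair<int, int>> tree;
    for (int row = 0; row < n; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col < row && sets.join(row, col)) {
                tree.emplace_back(row, col);  // stored as (max, min) pair
            }
        }
    }
    return tree;
}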
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace climination_forest +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 700a01116c9..31f369356f8 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -17,6 +17,7 @@ target_sources(ginkgo_omp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index 293bb70ff8b..76b1154512b 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -30,52 +30,6 @@ namespace omp { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - disjoint_sets sets(exec, size); - const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); - // collect edge list - for (auto row : irange(static_cast(size))) { - for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { - const auto col = cols[nz]; - // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); - } - } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); - // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); - IndexType output_count{}; - // Kruskal algorithm: Connect unconnected components using edges with - // ascending weight - for (const auto edge : edges) { - const auto first_rep = sets.find(edge.first); - const auto second_rep = sets.find(edge.second); - if (first_rep != second_rep) { - // we are only interested in the lower triangle, so we add an edge - // max -> min - out_rows[output_count] = edge.first; - out_cols[output_count] = edge.second; - output_count++; - sets.join(first_rep, second_rep); - } - } - assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); - components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -178,49 +132,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - const auto num_rows = static_cast(factors->get_size()[0]); - components::fill_array(exec, parents, num_rows, num_rows); -#pragma omp parallel for - for (IndexType l_col = 0; l_col < num_rows; l_col++) { - 
const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - parents[l_col] = l_row; - break; - } - } - } - // group by parent - array parents_copy{exec, static_cast(num_rows)}; - exec->copy(num_rows, parents, parents_copy.get_data()); - components::fill_seq_array(exec, children, num_rows); - const auto it = - detail::make_zip_iterator(parents_copy.get_data(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), - num_rows, num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..e150877e3a2 --- /dev/null +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& 
forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + const auto num_rows = static_cast(factors->get_size()[0]); + components::fill_array(exec, parents, num_rows, num_rows); +#pragma omp parallel for + for (IndexType l_col = 0; l_col < num_rows; l_col++) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + parents[l_col] = l_row; + break; + } + } + } + // group by parent + array parents_copy{exec, static_cast(num_rows)}; + exec->copy(num_rows, parents, parents_copy.get_data()); + components::fill_seq_array(exec, children, num_rows); + const auto it = + detail::make_zip_iterator(parents_copy.get_data(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), + num_rows, num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 94e61d43d5c..27c04a4db09 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(ginkgo_reference distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index 94d8903ca07..e489a3a2c57 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -32,53 +32,6 @@ namespace reference { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - disjoint_sets sets(exec, size); - const auto nnz = static_cast(row_ptrs[size]); - vector> edges(exec); - edges.reserve(nnz); - // collect edge list - for (auto row : irange(static_cast(size))) { - for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { - const auto col = cols[nz]; - // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); - } - } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); - // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); - IndexType output_count{}; - // Kruskal algorithm: Connect unconnected components using edges with - // ascending weight - for (const auto edge : edges) { - const auto first_rep = sets.find(edge.first); - const auto second_rep = sets.find(edge.second); - if (first_rep != second_rep) { - // we are only interested in the lower triangle, so we add an edge - // max -> min - out_rows[output_count] = edge.first; - out_cols[output_count] = 
edge.second; - output_count++; - sets.join(first_rep, second_rep); - } - } - assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); - components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -154,44 +107,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto num_rows = static_cast(factors->get_size()[0]); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - // filled with sentinel for unattached nodes - std::fill_n(parents, num_rows, num_rows); - for (IndexType row = 0; row < num_rows; row++) { - const auto row_begin = row_ptrs[row]; - const auto row_end = row_ptrs[row + 1]; - for (auto nz = row_begin; nz < row_end; nz++) { - const auto col = col_idxs[nz]; - // use the lower triangle, min row from column - if (col < row) { - parents[col] = std::min(parents[col], row); - } - } - } - // group by parent - vector parents_copy(parents, parents + num_rows, {exec}); - std::iota(children, children + num_rows, 0); - const auto it = detail::make_zip_iterator(parents_copy.begin(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, - num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..bd26900b78e --- /dev/null +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output 
helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto num_rows = static_cast(factors->get_size()[0]); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + // filled with sentinel for unattached nodes + std::fill_n(parents, num_rows, num_rows); + for (IndexType row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + // use the lower triangle, min row from column + if (col < row) { + parents[col] = std::min(parents[col], row); + } + } + } + // group by parent + vector parents_copy(parents, parents + num_rows, {exec}); + std::iota(children, children + num_rows, 0); + const auto it = detail::make_zip_iterator(parents_copy.begin(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, + num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index cf72c13a0b1..9b47fcd6ad1 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -17,6 +17,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -248,7 +249,7 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) gko::factorization::compute_elim_forest(this->mtx.get(), this->forest); - gko::kernels::reference::cholesky::compute_skeleton_tree( + gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); @@ -360,7 +361,7 @@ TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing) elimination_forest forest{this->ref, static_cast(this->num_rows)}; - 
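// Illustration (assumption-laden sketch, not Ginkgo code): from_factor above
// reads the elimination forest directly off the symbolic factor, using
// parent[j] = min{ i > j : l_ij != 0 }, and then groups the nodes by parent to
// build the children/child_ptrs arrays. The same computation on plain CSR
// arrays, with hypothetical names, could look like this:
#include <algorithm>
#include <numeric>
#include <vector>

struct simple_forest {
    std::vector<int> parents;     // parent[j], or n for roots (pseudo-root n)
    std::vector<int> children;    // node ids grouped by parent
    std::vector<int> child_ptrs;  // offsets into children, one range per parent
};

simple_forest forest_from_lower_factor(const std::vector<int>& row_ptrs,
                                       const std::vector<int>& cols)
{
    const auto n = static_cast<int>(row_ptrs.size()) - 1;
    simple_forest f;
    f.parents.assign(n, n);  // sentinel: unattached nodes point to pseudo-root
    for (int row = 0; row < n; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col < row) {  // strictly lower triangle: l_{row,col} != 0
                f.parents[col] = std::min(f.parents[col], row);
            }
        }
    }
    // group nodes by parent: a stable sort keeps children in ascending order,
    // then a counting pass plus prefix sum yields the CSR-like child_ptrs
    f.children.resize(n);
    std::iota(f.children.begin(), f.children.end(), 0);
    std::stable_sort(f.children.begin(), f.children.end(),
                     [&](int a, int b) { return f.parents[a] < f.parents[b]; });
    f.child_ptrs.assign(n + 2, 0);
    for (int node = 0; node < n; node++) {
        f.child_ptrs[f.parents[node] + 1]++;
    }
    std::partial_sum(f.child_ptrs.begin(), f.child_ptrs.end(),
                     f.child_ptrs.begin());
    return f;
}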
gko::kernels::reference::cholesky::forest_from_factor( + gko::kernels::reference::elimination_forest::from_factor( this->ref, combined_factor.get(), forest); this->assert_equal_forests(forest, *forest_ref); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index a08b71d0959..f4c4424c4e4 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -18,6 +18,7 @@ #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" @@ -171,12 +172,14 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) const auto skeleton = matrix_type::create(this->ref, size, size[0]); const auto dskeleton = matrix_type::create(this->exec, size, size[0]); - gko::kernels::reference::cholesky::compute_skeleton_tree( + gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::compute_skeleton_tree( - this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest:: + compute_skeleton_tree(this->exec, dmtx->get_const_row_ptrs(), + dmtx->get_const_col_idxs(), size[0], + dskeleton->get_row_ptrs(), + dskeleton->get_col_idxs()); // check that the created graphs are trees and have the same edge sum const auto weight_sum = this->check_mst(skeleton); @@ -307,7 +310,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest::from_factor( this->exec, dfactors.get(), dforest); this->assert_equal_forests(*forest, dforest); From 213ced61f7427920e538e7c4daec68fed460215b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 11 Jan 2025 14:38:15 +0100 Subject: [PATCH 12/26] update copyright years --- common/cuda_hip/components/memory.nvidia.hpp.inc | 2 +- common/cuda_hip/factorization/cholesky_kernels.cpp | 2 +- common/cuda_hip/factorization/elimination_forest_kernels.cpp | 2 +- core/factorization/cholesky_kernels.hpp | 2 +- core/factorization/elimination_forest_kernels.hpp | 2 +- core/factorization/symbolic.hpp | 2 +- core/test/utils/assertions.hpp | 2 +- dpcpp/factorization/cholesky_kernels.dp.cpp | 2 +- dpcpp/factorization/elimination_forest_kernels.dp.cpp | 2 +- omp/factorization/cholesky_kernels.cpp | 2 +- omp/factorization/elimination_forest_kernels.cpp | 2 +- reference/factorization/cholesky_kernels.cpp | 2 +- reference/factorization/elimination_forest_kernels.cpp | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index cf8d80c64e9..18ae23ffc38 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git 
a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 483144d8708..ce4dc823b98 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 6291b7628bc..d1a5a2dc8d4 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index f80e057da3a..fa87edd7fe3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp index 8d76ee73b54..83484ccbffc 100644 --- a/core/factorization/elimination_forest_kernels.hpp +++ b/core/factorization/elimination_forest_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 84478e214e6..f3a6624a762 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 13e5dfe0ea5..6b1014cdba0 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 50bdbceaa75..76a2f9d2b6a 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp index 4aecc0993f9..95bd75e49e0 100644 --- a/dpcpp/factorization/elimination_forest_kernels.dp.cpp +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index 76b1154512b..826e06026da 100644 --- 
a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index e150877e3a2..e65f7852208 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e489a3a2c57..dea639f376f 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index bd26900b78e..f0d46de5ca6 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause From 7bb00ab48ea5494ffe669bf5604887d9f964ce9c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 31 Dec 2024 18:43:46 +0100 Subject: [PATCH 13/26] rename compute_elim_forest to compute_elimination_forest --- .../reference/sparse_blas.reordered.stdout | 2 +- core/factorization/elimination_forest.cpp | 47 ++++++++++--------- core/factorization/elimination_forest.hpp | 4 +- core/factorization/symbolic.cpp | 7 +-- .../test/factorization/elimination_forest.cpp | 12 ++--- .../test/factorization/cholesky_kernels.cpp | 23 ++++----- test/factorization/cholesky_kernels.cpp | 23 ++++----- 7 files changed, 61 insertions(+), 57 deletions(-) diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout index b5fc8998be0..f40bf8814dd 100644 --- a/benchmark/test/reference/sparse_blas.reordered.stdout +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -9,7 +9,7 @@ "bandwidth": 1.0, "repetitions": 10, "components": { - "compute_elim_forest": 1.0, + "compute_elimination_forest": 1.0, "allocate": 1.0, "free": 1.0, "components::fill_array": 1.0, diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 1dc8ff060a0..59d7f3638d2 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -13,10 +13,9 @@ namespace { template -void compute_elim_forest_parent_impl(std::shared_ptr host_exec, - const IndexType* row_ptrs, - const IndexType* cols, IndexType num_rows, - IndexType* parent) +void compute_elimination_forest_parent_impl( + std::shared_ptr host_exec, const IndexType* row_ptrs, + const IndexType* cols, IndexType num_rows, IndexType* parent) { disjoint_sets subtrees{host_exec, num_rows}; array 
subtree_root_array{host_exec, @@ -50,8 +49,10 @@ void compute_elim_forest_parent_impl(std::shared_ptr host_exec, template -void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, - IndexType* child_ptr, IndexType* child) +void compute_elimination_forest_children_impl(const IndexType* parent, + IndexType size, + IndexType* child_ptr, + IndexType* child) { // count how many times each parent occurs, excluding pseudo-root at // parent == size @@ -74,7 +75,7 @@ void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, template -void compute_elim_forest_postorder_impl( +void compute_elimination_forest_postorder_impl( std::shared_ptr host_exec, const IndexType* parent, const IndexType* child_ptr, const IndexType* child, IndexType size, IndexType* postorder, IndexType* inv_postorder) @@ -111,10 +112,9 @@ void compute_elim_forest_postorder_impl( template -void compute_elim_forest_postorder_parent_impl(const IndexType* parent, - const IndexType* inv_postorder, - IndexType size, - IndexType* postorder_parent) +void compute_elimination_forest_postorder_parent_impl( + const IndexType* parent, const IndexType* inv_postorder, IndexType size, + IndexType* postorder_parent) { for (IndexType row = 0; row < size; row++) { postorder_parent[inv_postorder[row]] = @@ -140,26 +140,27 @@ void elimination_forest::set_executor( template -void compute_elim_forest(const matrix::Csr* mtx, - std::unique_ptr>& forest) +void compute_elimination_forest( + const matrix::Csr* mtx, + std::unique_ptr>& forest) { const auto host_exec = mtx->get_executor()->get_master(); const auto host_mtx = make_temporary_clone(host_exec, mtx); const auto num_rows = static_cast(host_mtx->get_size()[0]); forest = std::make_unique>(host_exec, num_rows); - compute_elim_forest_parent_impl(host_exec, host_mtx->get_const_row_ptrs(), - host_mtx->get_const_col_idxs(), num_rows, - forest->parents.get_data()); - compute_elim_forest_children_impl(forest->parents.get_const_data(), - num_rows, forest->child_ptrs.get_data(), - forest->children.get_data()); - compute_elim_forest_postorder_impl( + compute_elimination_forest_parent_impl( + host_exec, host_mtx->get_const_row_ptrs(), + host_mtx->get_const_col_idxs(), num_rows, forest->parents.get_data()); + compute_elimination_forest_children_impl( + forest->parents.get_const_data(), num_rows, + forest->child_ptrs.get_data(), forest->children.get_data()); + compute_elimination_forest_postorder_impl( host_exec, forest->parents.get_const_data(), forest->child_ptrs.get_const_data(), forest->children.get_const_data(), num_rows, forest->postorder.get_data(), forest->inv_postorder.get_data()); - compute_elim_forest_postorder_parent_impl( + compute_elimination_forest_postorder_parent_impl( forest->parents.get_const_data(), forest->inv_postorder.get_const_data(), num_rows, forest->postorder_parents.get_data()); @@ -169,7 +170,7 @@ void compute_elim_forest(const matrix::Csr* mtx, #define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elim_forest( \ + void compute_elimination_forest( \ const matrix::Csr* mtx, \ std::unique_ptr>& forest) diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp index 5307a90384c..50864ec6597 100644 --- a/core/factorization/elimination_forest.hpp +++ b/core/factorization/elimination_forest.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -41,7 
+41,7 @@ struct elimination_forest { template -void compute_elim_forest( +void compute_elimination_forest( const matrix::Csr* mtx, std::unique_ptr>& forest); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 9c7e1785396..4d4033f519c 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -35,7 +35,8 @@ GKO_REGISTER_OPERATION(symbolic_factorize_simple, lu_factorization::symbolic_factorize_simple); GKO_REGISTER_OPERATION(symbolic_factorize_simple_finalize, lu_factorization::symbolic_factorize_simple_finalize); -GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest); +GKO_REGISTER_HOST_OPERATION(compute_elimination_forest, + compute_elimination_forest); } // namespace @@ -51,7 +52,7 @@ void symbolic_cholesky( GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); const auto host_exec = exec->get_master(); - exec->run(make_compute_elim_forest(mtx, forest)); + exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; array tmp{exec}; @@ -100,7 +101,7 @@ void symbolic_cholesky_device( exec->run(make_compute_skeleton_tree( mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, skeleton->get_row_ptrs(), skeleton->get_col_idxs())); - exec->run(make_compute_elim_forest(skeleton.get(), forest)); + exec->run(make_compute_elimination_forest(skeleton.get(), forest)); } array row_ptrs{exec, num_rows + 1}; array tmp{exec}; diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 292b366f50e..9bbc9536c93 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -55,7 +55,7 @@ TYPED_TEST(EliminationForest, WorksForExample) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 8, 6, 7, 8, 9, 10})); @@ -92,7 +92,7 @@ TYPED_TEST(EliminationForest, WorksForSeparable) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 2, 10, 4, 5, 9, 7, 9, 9, 10})); @@ -128,7 +128,7 @@ TYPED_TEST(EliminationForest, WorksForPostOrderNotSelfInverse) }, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 5, 6, 7, 8, 9, 10})); GKO_ASSERT_ARRAY_EQ(forest->child_ptrs, @@ -152,7 +152,7 @@ TYPED_TEST(EliminationForest, WorksForAni1) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); // the elimination tree is a path gko::array iota_arr{this->ref, 36}; @@ -178,7 +178,7 @@ TYPED_TEST(EliminationForest, WorksForAni1Amd) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ( 
forest->parents, diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 9b47fcd6ad1..17bb0a36426 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -92,7 +92,8 @@ class Cholesky : public ::testing::Test { l_factor_ref->get_num_stored_elements()); combined = matrix_type::create(ref, combined_ref->get_size(), combined_ref->get_num_stored_elements()); - gko::factorization::compute_elim_forest(l_factor_ref.get(), forest); + gko::factorization::compute_elimination_forest(l_factor_ref.get(), + forest); // init sparsity lookup ref->copy(num_rows + 1, l_factor_ref->get_const_row_ptrs(), l_factor->get_row_ptrs()); @@ -246,18 +247,18 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) auto skeleton = matrix_type::create( this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); std::unique_ptr skeleton_forest; - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); - gko::factorization::compute_elim_forest(skeleton.get(), - skeleton_forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); this->assert_equal_forests(*skeleton_forest, *this->forest); }, @@ -273,8 +274,8 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::array row_nnz{this->ref, this->num_rows}; gko::kernels::reference::cholesky::symbolic_count( @@ -295,8 +296,8 @@ TYPED_TEST(Cholesky, KernelSymbolicFactorize) using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::cholesky::symbolic_count( this->ref, this->mtx.get(), *this->forest, this->l_factor->get_row_ptrs(), this->tmp); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index f4c4424c4e4..ed422978086 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -190,11 +190,11 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) std::unique_ptr forest; std::unique_ptr skeleton_forest; std::unique_ptr dskeleton_forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(skeleton.get(), - skeleton_forest); - gko::factorization::compute_elim_forest(dskeleton.get(), - dskeleton_forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elimination_forest(dskeleton.get(), + dskeleton_forest); // the parents array fully determines the elimination forest GKO_ASSERT_ARRAY_EQ(forest->parents, 
skeleton_forest->parents); GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, @@ -214,8 +214,8 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) const auto dmtx = gko::clone(this->exec, mtx); std::unique_ptr forest; std::unique_ptr dforest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array row_nnz{this->ref, mtx->get_size()[0]}; gko::array drow_nnz{this->exec, mtx->get_size()[0]}; @@ -241,7 +241,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) const auto dmtx = gko::clone(this->exec, mtx); const auto num_rows = mtx->get_size()[0]; std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); gko::array row_ptrs{this->ref, num_rows + 1}; gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_ptrs.get_data(), this->tmp); @@ -258,7 +258,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) gko::array{this->exec, nnz}, row_ptrs); // need to call the device kernels to initialize dtmp std::unique_ptr dforest; - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); @@ -358,8 +358,9 @@ class Cholesky : public CommonTestFixture { mtx_chol_sparsity->copy_from(mtx_chol.get()); dmtx_chol_sparsity = sparsity_pattern_type::create(exec); dmtx_chol_sparsity->copy_from(mtx_chol_sparsity.get()); - gko::factorization::compute_elim_forest(mtx_chol.get(), forest); - gko::factorization::compute_elim_forest(dmtx_chol.get(), dforest); + gko::factorization::compute_elimination_forest(mtx_chol.get(), forest); + gko::factorization::compute_elimination_forest(dmtx_chol.get(), + dforest); } void forall_matrices(std::function fn) From 6bff427e0ac1c9469311a0c5f47e3593f95b469e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:24:07 +0100 Subject: [PATCH 14/26] consider only lower triangular entries for skeleton tree input Co-authored-by: Marcel Koch --- reference/factorization/elimination_forest_kernels.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index f0d46de5ca6..7fc0df8c4ec 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -39,9 +39,11 @@ void compute_skeleton_tree(std::shared_ptr exec, for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { const auto col = cols[nz]; + if (col >= row) { + continue; + } // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); + edges.emplace_back(row, col); } } // sort edge list ascending by edge weight From 5aa16c8c47b527c17e69395ed346b13390be5674 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:34:58 +0100 Subject: [PATCH 15/26] fix memory script --- dev_tools/scripts/generate_cuda_memory_ptx.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index ed6a6cb32cf..265263f130f 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -62,7 +62,7 @@ class operation: operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header -print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +print("""// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -289,7 +289,8 @@ class operation: for s in memory_spaces_without_shared: o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True) const_ptr_expr = s.ptr_expr.format( ptr=f"const_cast*>(ptr)") mut_ptr_expr = s.ptr_expr.format(ptr="ptr") From 941ceb7ec83976ff02bbf4be4d3ce99331cd3ff0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:36:18 +0100 Subject: [PATCH 16/26] remove unnecessary tabs/newlines from generation script --- dev_tools/scripts/generate_cuda_memory_ptx.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 265263f130f..f5c995a76a2 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -253,14 +253,14 @@ class operation: __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ {t.parent_name} result; - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #else - " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;" "}}" : "={t.val_constraint}"(result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -273,13 +273,13 @@ class operation: __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #else - " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result)) @@ -299,15 +299,15 @@ class operation: {{ {t.parent_name} 
real_result; {t.parent_name} imag_result; - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #else - "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;" "}}" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -320,14 +320,14 @@ class operation: {{ auto real_result = static_cast<{t.parent_name}>(result.real()); auto imag_result = static_cast<{t.parent_name}>(result.imag()); - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #else - "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) From 418714b5ec2f56bcc7456fc3bb768bf377b30fc3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:36:43 +0100 Subject: [PATCH 17/26] rebuild memory.nvidia.hpp.inc --- .../cuda_hip/components/memory.nvidia.hpp.inc | 128 +++++++++--------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index 18ae23ffc38..b9cef35ac77 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -2423,14 +2423,14 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.cta.b16 t, [%1];\n\t" + " ld.relaxed.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2443,13 +2443,13 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) __device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.cta.b16 [%0], t;\n\t" + " st.relaxed.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), 
"f"(static_cast(result)) @@ -2461,14 +2461,14 @@ __device__ __forceinline__ __half load_acquire_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.cta.b16 t, [%1];\n\t" + " ld.acquire.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2482,13 +2482,13 @@ __device__ __forceinline__ void store_release_local(__half* ptr, __half result) { membar_acq_rel_local(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.cta.b16 [%0], t;\n\t" + " st.release.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2500,14 +2500,14 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.gpu.b16 t, [%1];\n\t" + " ld.relaxed.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2520,13 +2520,13 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) __device__ __forceinline__ void store_relaxed(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.gpu.b16 [%0], t;\n\t" + " st.relaxed.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2538,14 +2538,14 @@ __device__ __forceinline__ __half load_acquire(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.gpu.b16 t, [%1];\n\t" + " ld.acquire.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2559,13 +2559,13 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result) { membar_acq_rel(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.gpu.b16 [%0], t;\n\t" + " st.release.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2579,15 +2579,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed_local( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -2602,14 +2602,14 @@ __device__ __forceinline__ void store_relaxed_local( auto real_result = static_cast(result.real()); auto 
imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.cta.v2.b16 [%0], t;\n\t" + "st.relaxed.cta.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) @@ -2623,15 +2623,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -2646,14 +2646,14 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.gpu.v2.b16 [%0], t;\n\t" + "st.relaxed.gpu.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) From bfd2598cc81da2884434d382ca337bb6e4c34e80 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:53:38 +0100 Subject: [PATCH 18/26] review updates Co-authored-by: Marcel Koch Co-authored-by: Yu-Hsiang M. 
Tsai --- benchmark/sparse_blas/operations.cpp | 4 +-- .../elimination_forest_kernels.cpp | 6 +--- core/device_hooks/common_kernels.inc.cpp | 2 -- core/factorization/elimination_forest.cpp | 9 +++--- .../elimination_forest_kernels.hpp | 29 ++++--------------- .../elimination_forest_kernels.cpp | 11 ++++--- reference/factorization/cholesky_kernels.cpp | 3 -- .../elimination_forest_kernels.cpp | 6 ++-- .../test/factorization/cholesky_kernels.cpp | 7 ++--- test/factorization/cholesky_kernels.cpp | 2 -- 10 files changed, 25 insertions(+), 54 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 1b4e5819793..50c86c19bcf 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -616,7 +616,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { public: explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, device_{device}, result_{} + : mtx_{mtx}, device_{device}, symmetric_{symmetric}, result_{} {} std::pair validate() const override @@ -660,8 +660,8 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; - bool symmetric_; bool device_; + bool symmetric_; std::unique_ptr result_; std::unique_ptr> forest_; }; diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index d1a5a2dc8d4..41770e29a65 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -18,17 +18,13 @@ #include #include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest_kernels.hpp" @@ -232,7 +228,7 @@ void compute_skeleton_tree(std::shared_ptr exec, const auto worklist_size = ceildiv(nnz, config::warp_size * 2) * config::warp_size; // create 2 worklists consisting of (start, end, edge_id) - array worklist{exec, static_cast(worklist_size * 8)}; + array worklist{exec, static_cast(worklist_size * 6)}; auto wl1_source = worklist.get_data(); auto wl1_target = wl1_source + worklist_size; auto wl1_edge_id = wl1_target + worklist_size; diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 19c9917ed4b..437e1145c40 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -941,8 +941,6 @@ namespace elimination_forest { GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); -GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES); -GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); } // namespace elimination_forest diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 59d7f3638d2..89bd9505f6d 100644 --- 
a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -169,12 +169,13 @@ void compute_elimination_forest( } -#define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elimination_forest( \ - const matrix::Csr* mtx, \ +#define GKO_DECLARE_COMPUTE_ELIMINATION_FOREST(ValueType, IndexType) \ + void compute_elimination_forest( \ + const matrix::Csr* mtx, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COMPUTE_ELIMINATION_FOREST); } // namespace factorization diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp index 83484ccbffc..39435846119 100644 --- a/core/factorization/elimination_forest_kernels.hpp +++ b/core/factorization/elimination_forest_kernels.hpp @@ -35,30 +35,11 @@ namespace kernels { gko::factorization::elimination_forest& forest) -#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType) \ - void compute_subtree_sizes( \ - std::shared_ptr exec, \ - const gko::factorization::elimination_forest& forest, \ - IndexType* subtree_size) - - -#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) \ - void compute_postorder( \ - std::shared_ptr exec, \ - const gko::factorization::elimination_forest& forest, \ - const IndexType* subtree_size, IndexType* postorder, \ - IndexType* inv_postorder) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index e65f7852208..57bd9a04530 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -31,21 +31,24 @@ void compute_skeleton_tree(std::shared_ptr exec, { disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); + vector> edges(exec); + edges.reserve(nnz / 2); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { const auto col = cols[nz]; + if (col >= row) { + continue; + } // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); + edges.emplace_back(row, col); } } // sort edge list ascending by edge weight std::sort(edges.begin(), edges.end()); // output helper array: Store row indices for output rows // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); + vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with // ascending weight diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index dea639f376f..80b6c53663b 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ 
b/reference/factorization/cholesky_kernels.cpp @@ -6,16 +6,13 @@ #include #include -#include #include #include "core/base/allocator.hpp" -#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index 7fc0df8c4ec..a86aac7f281 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -13,9 +13,7 @@ #include "core/base/allocator.hpp" #include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" -#include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest_kernels.hpp" @@ -34,7 +32,7 @@ void compute_skeleton_tree(std::shared_ptr exec, disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); vector> edges(exec); - edges.reserve(nnz); + edges.reserve(nnz / 2); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { @@ -50,7 +48,7 @@ void compute_skeleton_tree(std::shared_ptr exec, std::sort(edges.begin(), edges.end()); // output helper array: Store row indices for output rows // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); + vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with // ascending weight diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 17bb0a36426..1227c7e5e3d 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -142,6 +142,9 @@ class Cholesky : public ::testing::Test { fn(); } { + // structurally this is the example from Liu 1990 + // "The Role of Elimination Trees in Sparse Factorization" + // https://doi.org/10.1137/0611010. 
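For illustration, the Kruskal-style construction used by the OpenMP and reference compute_skeleton_tree kernels above boils down to the following host-side sketch. It assumes a plain CSR pattern held in std::vector containers; find_root and skeleton_tree are illustrative names rather than Ginkgo API, and the flat parent array stands in for gko::disjoint_sets. Because the edges are emitted row by row and the row index doubles as the edge weight, they are already sorted and Kruskal's algorithm can consume them directly, exactly as the comments in the kernels point out.

#include <cstdint>
#include <utility>
#include <vector>

using index_type = std::int64_t;

// Illustrative union-find lookup with path halving; this stands in for
// gko::disjoint_sets, which the kernels above use instead.
index_type find_root(std::vector<index_type>& parent, index_type node)
{
    while (parent[node] != node) {
        parent[node] = parent[parent[node]];  // path halving
        node = parent[node];
    }
    return node;
}

// Kruskal-style skeleton tree: keep one strictly lower-triangular edge per
// pair of components that it connects for the first time. The edges appear
// in row order and the row index doubles as the edge weight, so no extra
// sorting is required.
std::vector<std::pair<index_type, index_type>> skeleton_tree(
    const std::vector<index_type>& row_ptrs,
    const std::vector<index_type>& cols)
{
    const auto num_rows = static_cast<index_type>(row_ptrs.size()) - 1;
    std::vector<index_type> parent(num_rows);
    for (index_type i = 0; i < num_rows; i++) {
        parent[i] = i;
    }
    std::vector<std::pair<index_type, index_type>> tree_edges;
    for (index_type row = 0; row < num_rows; row++) {
        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col >= row) {
                continue;  // only consider the strictly lower triangle
            }
            const auto row_rep = find_root(parent, row);
            const auto col_rep = find_root(parent, col);
            if (row_rep != col_rep) {
                parent[row_rep] = col_rep;  // join the two components
                tree_edges.emplace_back(row, col);
            }
        }
    }
    return tree_edges;
}

Each accepted edge merges two previously disconnected components, so at most num_rows - 1 edges survive. The resulting skeleton matrix is much sparser than the input but has the same elimination forest, which is what the KernelComputeSkeletonTree tests in this series verify.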
SCOPED_TRACE("example"); this->setup( {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0}, @@ -238,7 +241,6 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; @@ -249,7 +251,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) std::unique_ptr skeleton_forest; gko::factorization::compute_elimination_forest(this->mtx.get(), this->forest); - gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], @@ -269,7 +270,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( @@ -291,7 +291,6 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) TYPED_TEST(Cholesky, KernelSymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ed422978086..e2696f11d89 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -50,7 +50,6 @@ class CholeskySymbolic : public CommonTestFixture { {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); // this is the example from Liu 1990 https://doi.org/10.1137/0611010. 
// "The Role of Elimination Trees in Sparse Factorization" - matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, @@ -159,7 +158,6 @@ TYPED_TEST_SUITE(CholeskySymbolic, Types, PairTypenameNameGenerator); TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) { using matrix_type = typename TestFixture::matrix_type; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using elimination_forest = typename TestFixture::elimination_forest; for (const auto& pair : this->matrices) { From 8656cdf95df90ce1677e8043c0807ef43c564a6b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:57:41 +0100 Subject: [PATCH 19/26] remove unnecessary host_exec --- core/factorization/symbolic.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 4d4033f519c..92d0bf663cd 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -51,7 +51,6 @@ void symbolic_cholesky( using matrix_type = matrix::Csr; GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); - const auto host_exec = exec->get_master(); exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; @@ -94,7 +93,6 @@ void symbolic_cholesky_device( GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); const auto num_rows = mtx->get_size()[0]; - const auto host_exec = exec->get_master(); { const auto skeleton = matrix_type::create(exec, mtx->get_size(), num_rows); From aca0fdc44cf6748de2c6aadae6132dd47bcb3171 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:05:30 +0100 Subject: [PATCH 20/26] clarify sortedness in MST algorithm --- omp/factorization/elimination_forest_kernels.cpp | 8 +++++--- reference/factorization/elimination_forest_kernels.cpp | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index 57bd9a04530..f3f180a515d 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -44,10 +44,12 @@ void compute_skeleton_tree(std::shared_ptr exec, edges.emplace_back(row, col); } } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted + // since the input is sorted by edge.first == row, this will be sorted vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index a86aac7f281..dbf00d02d72 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -44,10 +44,12 @@ void compute_skeleton_tree(std::shared_ptr exec, edges.emplace_back(row, col); } } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); + // the edge list is now sorted by 
row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted + // since the input is sorted by edge.first == row, this will be sorted vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with From dbbdd5ef70a71443fc9506ec45b69f1404f87dbc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:06:47 +0100 Subject: [PATCH 21/26] remove unnecessary type alias --- reference/test/factorization/cholesky_kernels.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 1227c7e5e3d..0638f8ca931 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -242,7 +242,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) { using matrix_type = typename TestFixture::matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { From 5c7c56541788b4ee75bb3c5752991f402aa6908a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:14:08 +0100 Subject: [PATCH 22/26] remove dependency on core for disjoint_sets --- core/components/disjoint_sets.hpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/core/components/disjoint_sets.hpp b/core/components/disjoint_sets.hpp index 601817e69d3..77a9177d172 100644 --- a/core/components/disjoint_sets.hpp +++ b/core/components/disjoint_sets.hpp @@ -1,12 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #ifndef GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ #define GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ - -#include +#include "core/base/allocator.hpp" namespace gko { @@ -31,10 +30,8 @@ class disjoint_sets { * allocate storage. */ explicit disjoint_sets(std::shared_ptr exec, IndexType size) - : parents_{exec->get_master(), static_cast(size)} - { - parents_.fill(-1); - } + : parents_{static_cast(size), -1, exec->get_master()} + {} /** * Returns true if and only if the element x is the representative of its @@ -138,17 +135,14 @@ class disjoint_sets { * * @return the number of elements represented in this data structure. 
*/ - IndexType get_size() const { return parents_.get_size(); } + IndexType get_size() const { return parents_.size(); } private: - const IndexType& parent(IndexType x) const - { - return parents_.get_const_data()[x]; - } + const IndexType& parent(IndexType x) const { return parents_[x]; } - IndexType& parent(IndexType x) { return parents_.get_data()[x]; } + IndexType& parent(IndexType x) { return parents_[x]; } - array parents_; + vector parents_; }; From fb344a470462ea3e645b927e90becec4271b98de Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:35:26 +0100 Subject: [PATCH 23/26] add literature reference --- .../cuda_hip/factorization/elimination_forest_kernels.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 41770e29a65..069d448ea37 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -215,6 +215,13 @@ void compute_skeleton_tree(std::shared_ptr exec, size_type size, IndexType* out_row_ptrs, IndexType* out_cols) { + // This is a minimum spanning tree algorithm implementation based on + // A. Fallin, A. Gonzalez, J. Seo, and M. Burtscher, + // "A High-Performance MST Implementation for GPUs,” + // doi: 10.1145/3581784.3607093 + // we don't filter heavy edges since the heaviest edges are necessary to + // reach the last node and we don't need to sort since the COO format + // already sorts by row index. const auto policy = thrust_policy(exec); const auto nnz = exec->copy_val_to_host(row_ptrs + size); // convert edges to COO representation From 706d58472225126d4079a841d6f1a8278ba92984 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 15:40:58 +0100 Subject: [PATCH 24/26] fix dpcpp compilation --- dpcpp/factorization/elimination_forest_kernels.dp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp index 95bd75e49e0..01310b5c448 100644 --- a/dpcpp/factorization/elimination_forest_kernels.dp.cpp +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -17,7 +17,7 @@ namespace gko { namespace kernels { namespace dpcpp { -namespace climination_forest { +namespace elimination_forest { template @@ -40,7 +40,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); -} // namespace climination_forest +} // namespace elimination_forest } // namespace dpcpp } // namespace kernels } // namespace gko From 0debdc6c6e230ba67a805dd5905aadb2f422e005 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 16 Jan 2025 11:28:58 +0100 Subject: [PATCH 25/26] use atomic wrappers --- .../elimination_forest_kernels.cpp | 19 ++++---------- hip/components/memory.hip.hpp | 26 ++++++++++++++++++- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 069d448ea37..553032dcf2c 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -48,8 +48,6 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { - using atomic_type = std::conditional_t, - 
int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -57,8 +55,7 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( const auto row = rows[i]; const auto col = cols[i]; if (col < row) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(worklist_counter, 1); worklist_sources[out_i] = row; worklist_targets[out_i] = col; worklist_edge_ids[out_i] = i; @@ -93,12 +90,9 @@ __device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) template __device__ void guarded_atomic_min(IndexType* ptr, IndexType value) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; // only execute the atomic if we know that it might have an effect if (load_relaxed_local(ptr) > value) { - atomicMin(reinterpret_cast(ptr), - static_cast(value)); + atomic_min_relaxed(ptr, value); } } @@ -114,8 +108,6 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -126,8 +118,7 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( const auto source_rep = mst_find(parents, source); const auto target_rep = mst_find(parents, target); if (source_rep != target_rep) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(worklist_counter, 1); worklist_sources[out_i] = source_rep; worklist_targets[out_i] = target_rep; worklist_edge_ids[out_i] = edge_id; @@ -178,8 +169,7 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( repeat = true; } } while (repeat); - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(out_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(out_counter, 1); out_sources[out_i] = edge_sources[edge_id]; out_targets[out_i] = edge_targets[edge_id]; } @@ -309,6 +299,7 @@ void compute_skeleton_tree(std::shared_ptr exec, } } const auto num_mst_edges = exec->copy_val_to_host(output_counter); + // two separate sort calls get turned into efficient RadixSort invocations thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, out_cols); diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index 8a98ee822b8..d279a8ac9de 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -128,6 +128,30 @@ __device__ __forceinline__ void store_generic(ValueType* ptr, ValueType value) } +template +__device__ __forceinline__ ValueType atomic_add_relaxed(ValueType* ptr, + AddType value) +{ + return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_min_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_min(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_max(ptr, value, __ATOMIC_RELAXED); +} + + template __device__ __forceinline__ ValueType 
load_relaxed(const ValueType* ptr) { From 7ff5f5917198f6d70df21182abebe2c1224bd3ba Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 16 Jan 2025 12:57:44 +0100 Subject: [PATCH 26/26] improve atomic usage to avoid casts --- .../cuda_hip/components/memory.nvidia.hpp.inc | 238 ++++++++++++++++++ .../elimination_forest_kernels.cpp | 6 +- .../cuda_hip/matrix/csr_kernels.template.cpp | 6 +- common/cuda_hip/reorder/rcm_kernels.cpp | 36 +-- dev_tools/scripts/generate_cuda_memory_ptx.py | 22 +- hip/components/memory.hip.hpp | 21 ++ 6 files changed, 291 insertions(+), 38 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index b9cef35ac77..f77481cd793 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -153,6 +153,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, } +__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -241,6 +261,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, } +__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -467,6 +507,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, } +__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) { uint64 result; @@ -593,6 +653,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, } +__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; @@ -873,6 +953,26 @@ __device__ 
__forceinline__ int32 atomic_max_relaxed_local(int32* ptr, } +__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) { int64 result; @@ -961,6 +1061,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, } +__device__ __forceinline__ int64 atomic_cas_relaxed_local(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_local(const float* ptr) { float result; @@ -1187,6 +1307,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, } +__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) { uint64 result; @@ -1313,6 +1453,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, } +__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_local(const int32* ptr) { int32 result; @@ -1590,6 +1750,25 @@ __device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) } +__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; @@ -1675,6 +1854,25 @@ __device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) } +__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float 
load_relaxed(const float* ptr) { float result; @@ -1894,6 +2092,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) } +__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { uint64 result; @@ -2015,6 +2233,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) } +__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 553032dcf2c..52002e9a211 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -139,8 +139,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, IndexType* __restrict__ out_counter) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -159,9 +157,7 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( do { repeat = false; auto old_parent = - atomicCAS(reinterpret_cast(parents + old_rep), - static_cast(old_rep), - static_cast(new_rep)); + atomic_cas_relaxed(parents + old_rep, old_rep, new_rep); // if this fails, the parent of old_rep changed recently, so we need // to try again by updating the parent's parent (hopefully its rep) if (old_parent != old_rep) { diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index bd2423d4306..8b3167eca97 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -35,6 +35,7 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/reduction.hpp" @@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( } #else if (i < row_len) { - while (atomicCAS(local_storage + hash, empty, i) != empty) { + while (atomic_cas_relaxed_local(local_storage + hash, empty, + static_cast(i)) != empty) { hash++; if (hash >= available_storage) { hash = 0; diff --git 
a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 2bb18cbdd22..96370647721 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -153,22 +153,6 @@ __device__ __forceinline__ IndexType disjoint_set_find(IndexType node, } -template -struct atomic_map {}; - - -template <> -struct atomic_map { - using type = int; -}; - - -template <> -struct atomic_map { - using type = unsigned long long; -}; - - template __global__ __launch_bounds__(default_block_size) void connected_components_combine( @@ -176,7 +160,6 @@ __launch_bounds__(default_block_size) void connected_components_combine( const IndexType* __restrict__ col_idxs, IndexType num_rows, IndexType* __restrict__ parents) { - using atomic_type = typename atomic_map::type; const auto row = thread::get_thread_id_flat(); if (row >= num_rows) { return; @@ -196,10 +179,8 @@ __launch_bounds__(default_block_size) void connected_components_combine( auto& max_parent = col_parent < parent ? parent : col_parent; // attempt to attach the (assumed unattached) larger node to the // smaller node - const auto old_parent = static_cast(atomicCAS( - reinterpret_cast(parents + max_parent), - static_cast(max_parent), - static_cast(min_parent))); + const auto old_parent = atomic_cas_relaxed( + parents + max_parent, max_parent, min_parent); // if unsuccessful, proceed with the parent of the (now known // attached) node if (old_parent != max_parent) { @@ -354,7 +335,6 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( IndexType level, IndexType* __restrict__ node_levels, IndexType* __restrict__ level_nodes, IndexType* __restrict__ output_ptr) { - using atomic_type = typename atomic_map::type; const auto source = thread::get_thread_id_flat(); if (source >= num_sources) { return; @@ -362,17 +342,13 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( const auto row = sources[source]; const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; - atomic_type unsigned_unattached{}; const auto unattached = invalid_index(); - memcpy(&unsigned_unattached, &unattached, sizeof(IndexType)); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; if (node_levels[col] == unattached && - atomicCAS(reinterpret_cast(node_levels + col), - unsigned_unattached, - static_cast(level)) == unsigned_unattached) { - const auto output_pos = - atomicAdd(reinterpret_cast(output_ptr), 1); + atomic_cas_relaxed(node_levels + col, unattached, level) == + unattached) { + const auto output_pos = atomic_add_relaxed(output_ptr, 1); level_nodes[output_pos] = col; } } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index f5c995a76a2..089cbf5c2dd 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -169,7 +169,6 @@ class operation: }} """) for op in operations: - if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): # for some reason there is no signed 64 bit atomic support, # but since the operations are equivalent in two's complement, we can use unsigned instead @@ -193,6 +192,27 @@ class operation: : "memory"); return result; }} +""") + # non-relaxed atomics are unsupported 
with SM 6.0 and below + if o.is_relaxed and not t.ptx_type_suffix.startswith(".f"): + new_type = ".b" + t.ptx_type_suffix[2:] + print(f""" +__device__ __forceinline__ {t.name} atomic_cas{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} old_val, {t.name} new_val) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}.cas{new_type} %0, [%1], %2, %3;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(old_val), "{t.val_constraint}"(new_val) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index d279a8ac9de..a05bf7de344 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -152,6 +152,27 @@ __device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, } +template +__device__ __forceinline__ ValueType atomic_cas_relaxed(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + __atomic_compare_exchange_n(ptr, &old_val, new_val, false, __ATOMIC_RELAXED, + __ATOMIC_RELAXED); + return old_val; +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed_local(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + // no special optimization available for threadblock-local atomic CAS + return atomic_cas_relaxed(ptr, old_val, new_val); +} + + template __device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) {
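To see what the new atomic_cas_relaxed wrapper buys in practice, here is a rough host-side analogue of the retry loop that mst_join_edges and connected_components_combine run on the device. It uses std::atomic with relaxed ordering instead of the CUDA/HIP wrappers; find_representative and join_representatives are illustrative names, not Ginkgo functions, and the sketch is not the actual device code.

#include <atomic>
#include <cstdint>

using index_type = std::int64_t;

// Follow parent pointers until reaching a self-parented representative,
// similar to mst_find in the device kernels (without path compression).
index_type find_representative(const std::atomic<index_type>* parents,
                               index_type node)
{
    auto parent = parents[node].load(std::memory_order_relaxed);
    while (parent != node) {
        node = parent;
        parent = parents[node].load(std::memory_order_relaxed);
    }
    return node;
}

// Attach old_rep below new_rep with a relaxed compare-and-swap; on failure,
// continue one level further up, like the retry loop in mst_join_edges and
// connected_components_combine.
void join_representatives(std::atomic<index_type>* parents, index_type old_rep,
                          index_type new_rep)
{
    bool repeat = false;
    do {
        repeat = false;
        auto expected = old_rep;
        if (!parents[old_rep].compare_exchange_strong(
                expected, new_rep, std::memory_order_relaxed)) {
            // the parent of old_rep changed concurrently, so continue with
            // the value another thread installed there
            old_rep = expected;
            repeat = true;
        }
    } while (repeat);
}

A caller initializes parents[i] = i for every node; concurrent join_representatives calls then graft one representative under the other exactly once, because only the thread whose compare-exchange succeeds installs the new parent, while every other thread climbs to the freshly installed parent and retries from there.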