From e62b52fe781ba155f442dac4bf61144abfe7b522 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:55:25 +0100 Subject: [PATCH 01/26] add ani4 to symbolic cholesky test set --- test/factorization/cholesky_kernels.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ae2894d3696..6d352c135e7 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -88,6 +88,11 @@ class CholeskySymbolic : public CommonTestFixture { std::ifstream ani1_amd_stream{gko::matrices::location_ani1_amd_mtx}; matrices.emplace_back("ani1_amd", gko::read(ani1_amd_stream, ref)); + std::ifstream ani4_stream{gko::matrices::location_ani4_mtx}; + matrices.emplace_back("ani4", gko::read(ani4_stream, ref)); + std::ifstream ani4_amd_stream{gko::matrices::location_ani4_amd_mtx}; + matrices.emplace_back("ani4_amd", + gko::read(ani4_amd_stream, ref)); } void assert_equal_forests(elimination_forest& lhs, elimination_forest& rhs, From 016d0ea71f7509177033f0902d5baf2a5e5623d5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:56:35 +0100 Subject: [PATCH 02/26] fix sparsity pattern output --- core/test/utils/assertions.hpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 3dae62151b3..13e5dfe0ea5 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -108,6 +108,20 @@ class biggest_valuetype< }; +template +auto get_next_value_optional(NonzeroIterator& it, const NonzeroIterator& end, + size_type next_row, size_type next_col) + -> std::optionalvalue)>::type> +{ + if (it != end && it->row == next_row && it->column == next_col) { + return std::optional((it++)->value); + } else { + // monostate + return {}; + } +} + + template auto get_next_value(NonzeroIterator& it, const NonzeroIterator& end, size_type next_row, size_type next_col) -> @@ -190,11 +204,12 @@ void print_sparsity_pattern(Ostream& os, const MatrixData1& first, os << (row % 10); for (size_type col = 0; col < first.size[1]; col++) { const auto has_first = - get_next_value(first_it, end(first.nonzeros), row, col) != - zero(); + get_next_value_optional(first_it, end(first.nonzeros), row, col) + .has_value(); const auto has_second = - get_next_value(second_it, end(second.nonzeros), row, col) != - zero(); + get_next_value_optional(second_it, end(second.nonzeros), row, + col) + .has_value(); if (has_first) { if (has_second) { os << '+'; From 91f5b97181a3b268ee258c29b8a2979fc0c9b37d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 15:56:54 +0100 Subject: [PATCH 03/26] fix incorrect include --- common/cuda_hip/factorization/cholesky_kernels.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 7ff1382d8c6..d5badf6ef43 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -45,9 +45,6 @@ namespace cholesky { constexpr int default_block_size = 512; -#include "core/factorization/elimination_forest.hpp" - - namespace kernel { @@ -393,6 +390,15 @@ void factorize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const 
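
// --- Illustrative sketch (not part of the patch): the idea behind
// get_next_value_optional above. Returning a std::optional distinguishes
// "no entry stored at (row, col)" from "stored entry whose value happens to
// be zero", which the previous comparison against zero() could not. The
// nonzero struct and function names below are simplified stand-ins invented
// for illustration; they are not Ginkgo's matrix_data interface.
#include <iostream>
#include <optional>
#include <vector>

struct nonzero {  // simplified stand-in for a stored matrix entry
    int row;
    int column;
    double value;
};

// advances `it` past the entry at (row, col) if one is stored, and reports
// whether such an entry exists - independently of its numerical value
std::optional<double> next_value_optional(
    std::vector<nonzero>::const_iterator& it,
    std::vector<nonzero>::const_iterator end, int row, int col)
{
    if (it != end && it->row == row && it->column == col) {
        return (it++)->value;
    }
    return std::nullopt;
}

int main()
{
    // the explicit zero stored at (0, 1) is still part of the pattern
    const std::vector<nonzero> nonzeros{{0, 0, 1.0}, {0, 1, 0.0}, {1, 1, 2.0}};
    auto it = nonzeros.begin();
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            const auto entry =
                next_value_optional(it, nonzeros.end(), row, col);
            std::cout << (entry.has_value() ? '*' : ' ');
        }
        std::cout << '\n';
    }
}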
IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, From 50924cafe42aa6f5f13fae1382c97359e14b7942 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 18 Dec 2024 16:04:10 +0100 Subject: [PATCH 04/26] add Cholesky skeleton tree computation kernel --- core/device_hooks/common_kernels.inc.cpp | 1 + core/factorization/cholesky_kernels.hpp | 9 ++++ dpcpp/factorization/cholesky_kernels.dp.cpp | 9 ++++ omp/factorization/cholesky_kernels.cpp | 48 +++++++++++++++++++ reference/factorization/cholesky_kernels.cpp | 47 ++++++++++++++++++ .../test/factorization/cholesky_kernels.cpp | 31 ++++++++++++ test/factorization/cholesky_kernels.cpp | 24 ++++++++++ 7 files changed, 169 insertions(+) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 034bc73e388..668a4e0f6bf 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -927,6 +927,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); namespace cholesky { +GKO_STUB_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 8230fa4552d..9a9a165e6b4 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -20,6 +20,13 @@ namespace gko { namespace kernels { +#define GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + #define GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType) \ void symbolic_count( \ std::shared_ptr exec, \ @@ -66,6 +73,8 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType); \ template \ diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 5cd8396be17..1461f918be7 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -25,6 +25,15 @@ namespace dpcpp { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index aa4aabfc731..293bb70ff8b 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -9,6 +9,8 @@ #include +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -28,6 +30,52 @@ 
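
// --- Illustrative sketch (not part of the patch): the declaration / stub /
// implementation pattern the new compute_skeleton_tree kernel follows above,
// where it is declared once via a macro in cholesky_kernels.hpp, registered
// as a not-implemented stub for backends that lack it (common_kernels.inc.cpp,
// dpcpp), and given a real definition per backend (omp, reference). The macro
// and namespace names below are invented and the signature is abbreviated;
// they are not Ginkgo's actual macros.
#include <iostream>
#include <stdexcept>

#define DECLARE_COMPUTE_SKELETON_TREE(IndexType)          \
    void compute_skeleton_tree(const IndexType* row_ptrs, \
                               IndexType size, IndexType* out_row_ptrs)

#define STUB(declaration) \
    declaration { throw std::runtime_error("not implemented"); }

namespace stub_backend {
// backend without an implementation yet: compiles and links, throws if used
STUB(DECLARE_COMPUTE_SKELETON_TREE(int))
}  // namespace stub_backend

namespace reference_backend {
// backend with a real definition
DECLARE_COMPUTE_SKELETON_TREE(int)
{
    // the actual Kruskal-based algorithm from the patch would go here;
    // as a placeholder, emit an empty output pattern
    for (int i = 0; i <= size; i++) {
        out_row_ptrs[i] = 0;
    }
    (void)row_ptrs;
}
}  // namespace reference_backend

int main()
{
    int row_ptrs[] = {0, 0};
    int out_row_ptrs[2] = {};
    reference_backend::compute_skeleton_tree(row_ptrs, 1, out_row_ptrs);
    try {
        stub_backend::compute_skeleton_tree(row_ptrs, 1, out_row_ptrs);
    } catch (const std::exception& e) {
        std::cout << e.what() << '\n';  // prints "not implemented"
    }
}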
namespace omp { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e4d7112a15f..e67778c1797 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -11,6 +11,7 @@ #include #include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -31,6 +32,52 @@ namespace reference { namespace cholesky { +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + 
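
// --- Illustrative sketch (not part of the patch): the Kruskal construction
// used by compute_skeleton_tree above, written against a plain edge list and
// a minimal array-based union-find instead of Ginkgo's disjoint_sets helper.
// Edges are sorted ascending by their (max, min) vertex pair, so the first
// edge connecting two components is the one with the smallest "weight", and
// at most size - 1 edges survive. The patch then compresses the sorted output
// rows into CSR row pointers via convert_idxs_to_ptrs, which this sketch
// leaves out; all names here are simplified for illustration.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

int find(std::vector<int>& parent, int node)
{
    // walk up until we hit a root (a node that is its own parent)
    while (parent[node] != node) {
        parent[node] = parent[parent[node]];  // path halving
        node = parent[node];
    }
    return node;
}

// returns the tree edges as (row, col) pairs with row > col
std::vector<std::pair<int, int>> kruskal_skeleton(
    std::vector<std::pair<int, int>> edges, int size)
{
    std::sort(edges.begin(), edges.end());
    std::vector<int> parent(size);
    std::iota(parent.begin(), parent.end(), 0);
    std::vector<std::pair<int, int>> tree;
    for (const auto& edge : edges) {
        const auto first_rep = find(parent, edge.first);
        const auto second_rep = find(parent, edge.second);
        if (first_rep != second_rep) {
            tree.push_back(edge);
            parent[second_rep] = first_rep;  // join the two components
        }
    }
    return tree;
}

int main()
{
    // lower-triangular edges (row > col) of a small symmetric pattern
    const std::vector<std::pair<int, int>> edges{{2, 0}, {3, 1}, {3, 2}, {2, 1}};
    for (const auto& edge : kruskal_skeleton(edges, 4)) {
        std::cout << edge.first << " -> " << edge.second << '\n';
    }
}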
sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 630aa2d0406..cf72c13a0b1 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -233,6 +233,37 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) +{ + using matrix_type = typename TestFixture::matrix_type; + using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; + using elimination_forest = typename TestFixture::elimination_forest; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + this->forall_matrices( + [this] { + auto skeleton = matrix_type::create( + this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); + std::unique_ptr skeleton_forest; + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + + gko::kernels::reference::cholesky::compute_skeleton_tree( + this->ref, this->mtx->get_const_row_ptrs(), + this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], + skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + + gko::factorization::compute_elim_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elim_forest(skeleton.get(), + skeleton_forest); + + this->assert_equal_forests(*skeleton_forest, *this->forest); + }, + true); +} + + TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 6d352c135e7..d408bc930f8 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -130,6 +130,30 @@ using Types = gko::test::cartesian_type_product_tmatrices) { + SCOPED_TRACE(pair.first); + const auto& mtx = pair.second; + const auto dmtx = gko::clone(this->exec, mtx); + const auto size = mtx->get_size(); + auto skeleton = matrix_type::create(this->ref, size, size[0]); + auto dskeleton = matrix_type::create(this->exec, size, size[0]); + + gko::kernels::reference::cholesky::compute_skeleton_tree( + this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::compute_skeleton_tree( + this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); + + GKO_ASSERT_MTX_EQ_SPARSITY(skeleton, dskeleton); + } +} + + TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; From 0bf134dbf450c98224483a9605cb5e3752260d3a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 5 Dec 2024 21:45:03 +0100 Subject: [PATCH 05/26] add more precise atomic operations --- .../cuda_hip/components/memory.nvidia.hpp.inc | 1680 +++++++++++++++-- dev_tools/scripts/generate_cuda_memory_ptx.py | 50 +- 2 files changed, 1580 insertions(+), 150 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc 
b/common/cuda_hip/components/memory.nvidia.hpp.inc index f759c613f45..cf8d80c64e9 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -96,6 +96,63 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) } +__device__ __forceinline__ int32 atomic_add_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -127,6 +184,63 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) } +__device__ __forceinline__ int64 atomic_add_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -158,6 +272,25 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) } +__device__ __forceinline__ float atomic_add_relaxed_shared(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f32 %0, [%1], %2;" +#else + 
"atom.relaxed.cta.shared.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "f"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; @@ -189,6 +322,277 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) } +__device__ __forceinline__ double atomic_add_relaxed_shared(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_shared(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.relaxed.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.relaxed.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : 
"r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.relaxed.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_shared(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.relaxed.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; @@ -317,76 +721,968 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result) } -__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +__device__ __forceinline__ uint32 load_acquire_shared(const uint32* ptr) { - int32 result; + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.acquire.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return 
result; +} + + +__device__ __forceinline__ void store_release_shared(uint32* ptr, uint32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.release.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.acquire.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint64* ptr, uint64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.release.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.relaxed.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 atomic_add_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" 
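
// --- Illustrative sketch (not part of the patch): a CPU-side analogue, using
// std::atomic, of what the *_relaxed and load_acquire* / store_release*
// wrappers in this file provide on the GPU. A release store makes preceding
// plain writes visible to any thread whose acquire load observes the flag;
// relaxed accesses give atomicity without that ordering. The shared/local
// variants and the cta/gpu scope suffixes of the PTX versions have no direct
// std::atomic counterpart here.
#include <atomic>
#include <cassert>
#include <thread>

int data = 0;
std::atomic<int> flag{0};

void producer()
{
    data = 42;  // plain, non-atomic write
    // release: the write to data above becomes visible to any thread that
    // observes this store with an acquire load
    flag.store(1, std::memory_order_release);
}

void consumer()
{
    // acquire: once flag == 1 is seen, the write to data is guaranteed
    // visible; with relaxed ordering on both sides the assert could fire
    while (flag.load(std::memory_order_acquire) != 1) {
    }
    assert(data == 42);
}

int main()
{
    std::thread t1(producer), t2(consumer);
    t1.join();
    t2.join();
}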
+#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ float load_relaxed_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.relaxed.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.relaxed.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ float atomic_add_relaxed_local(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.relaxed.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.relaxed.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ double atomic_add_relaxed_local(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.relaxed.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.relaxed.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u32 %0, [%1], %2;" +#endif + : 
"=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.relaxed.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.relaxed.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u64 %0, [%1], %2;" +#else + 
"atom.relaxed.cta.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.acquire.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.release.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.acquire.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.release.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.acquire.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(float* ptr, float result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.release.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.acquire.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(double* ptr, double result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.release.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint32* ptr, uint32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_local(const uint64* 
ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint64* ptr, uint64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.gpu.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.gpu.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +{ + int64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.s64 %0, [%1];" #else - "ld.relaxed.cta.s32 %0, [%1];" + "ld.relaxed.gpu.s64 %0, [%1];" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); return result; } -__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.s64 [%0], %1;" #else - "st.relaxed.cta.s32 [%0], %1;" + "st.relaxed.gpu.s64 [%0], %1;" #endif ::"l"(ptr), - "r"(result) + "l"(result) : "memory"); } -__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +__device__ __forceinline__ int64 atomic_add_relaxed(int64* ptr, int64 value) { int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.cta.s64 %0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); + return result; +} + +__device__ __forceinline__ int64 atomic_min_relaxed(int64* ptr, int64 value) +{ 
+ int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); return result; } -__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +__device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s64 %0, [%1], %2;" #else - "st.relaxed.cta.s64 [%0], %1;" + "atom.relaxed.gpu.max.s64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed_local(const float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f32 %0, [%1];" #else - "ld.relaxed.cta.f32 %0, [%1];" + "ld.relaxed.gpu.f32 %0, [%1];" #endif : "=f"(result) : "l"(const_cast(ptr)) @@ -396,13 +1692,13 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr) } -__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +__device__ __forceinline__ void store_relaxed(float* ptr, float result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f32 [%0], %1;" #else - "st.relaxed.cta.f32 [%0], %1;" + "st.relaxed.gpu.f32 [%0], %1;" #endif ::"l"(ptr), "f"(result) @@ -410,14 +1706,32 @@ __device__ __forceinline__ void store_relaxed_local(float* ptr, float result) } -__device__ __forceinline__ double load_relaxed_local(const double* ptr) +__device__ __forceinline__ float atomic_add_relaxed(float* ptr, float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f64 %0, [%1];" #else - "ld.relaxed.cta.f64 %0, [%1];" + "ld.relaxed.gpu.f64 %0, [%1];" #endif : "=d"(result) : "l"(const_cast(ptr)) @@ -427,13 +1741,13 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr) } -__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +__device__ __forceinline__ void store_relaxed(double* ptr, double result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f64 [%0], %1;" #else - "st.relaxed.cta.f64 [%0], %1;" + "st.relaxed.gpu.f64 [%0], %1;" #endif ::"l"(ptr), "d"(result) @@ -441,255 +1755,263 @@ __device__ __forceinline__ void store_relaxed_local(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed(double* ptr, double value) { - int32 result; + double result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f64 %0, [%1], %2;" #else - "ld.acquire.cta.s32 %0, [%1];" + "atom.relaxed.gpu.add.f64 %0, [%1], %2;" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=d"(result) + : "l"(ptr), "d"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ 
__forceinline__ void store_release_local(int32* ptr, int32 result) +__device__ __forceinline__ uint32 load_relaxed(const uint32* ptr) { - membar_acq_rel_local(); + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "ld.volatile.u32 %0, [%1];" #else - "st.release.cta.s32 [%0], %1;" + "ld.relaxed.gpu.u32 %0, [%1];" #endif - ::"l"(ptr), - "r"(result) + : "=r"(result) + : "l"(const_cast(ptr)) : "memory"); -} - -__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) -{ - int64 result; - asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" -#else - "ld.acquire.cta.s64 %0, [%1];" -#endif - : "=l"(result) - : "l"(const_cast(ptr)) - : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +__device__ __forceinline__ void store_relaxed(uint32* ptr, uint32 result) { - membar_acq_rel_local(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" + "st.volatile.u32 [%0], %1;" #else - "st.release.cta.s64 [%0], %1;" + "st.relaxed.gpu.u32 [%0], %1;" #endif ::"l"(ptr), - "l"(result) + "r"(result) : "memory"); } -__device__ __forceinline__ float load_acquire_local(const float* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed(uint32* ptr, uint32 value) { - float result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f32 %0, [%1];" + "atom.relaxed.gpu.add.u32 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(float* ptr, float result) +__device__ __forceinline__ uint32 atomic_min_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u32 %0, [%1], %2;" #else - "st.release.cta.f32 [%0], %1;" + "atom.relaxed.gpu.min.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_local(const double* ptr) +__device__ __forceinline__ uint32 atomic_max_relaxed(uint32* ptr, uint32 value) { - double result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f64 %0, [%1];" + "atom.relaxed.gpu.max.u32 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(double* ptr, double result) +__device__ __forceinline__ uint32 atomic_and_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u32 %0, [%1], %2;" #else - "st.release.cta.f64 [%0], %1;" + "atom.relaxed.gpu.and.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "d"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } 
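
// --- Illustrative sketch (not part of the patch): what atomic_min_relaxed
// provides, written portably with std::atomic. PTX offers a single atom.min
// instruction for this; without such an instruction one falls back to a
// compare-exchange loop as below. The early "is the update even needed?"
// check is the same trick as the guarded_atomic_min helper used by the MST
// kernels later in this series.
#include <atomic>
#include <iostream>

int atomic_min_relaxed(std::atomic<int>& obj, int value)
{
    auto old = obj.load(std::memory_order_relaxed);
    // nothing to do if the stored value is already <= value
    while (old > value && !obj.compare_exchange_weak(
                              old, value, std::memory_order_relaxed)) {
        // compare_exchange_weak reloads `old` on failure, so we just retry
    }
    return old;  // value observed before the update, like the PTX wrapper
}

int main()
{
    std::atomic<int> min_edge{100};
    atomic_min_relaxed(min_edge, 42);
    atomic_min_relaxed(min_edge, 77);      // no effect, 42 is already smaller
    std::cout << min_edge.load() << '\n';  // prints 42
}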
-__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +__device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) { - int32 result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u32 %0, [%1], %2;" #else - "ld.relaxed.gpu.s32 %0, [%1];" + "atom.relaxed.gpu.or.u32 %0, [%1], %2;" #endif : "=r"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "r"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "ld.volatile.u64 %0, [%1];" #else - "st.relaxed.gpu.s32 [%0], %1;" + "ld.relaxed.gpu.u64 %0, [%1];" #endif - ::"l"(ptr), - "r"(result) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); + + return result; } -__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +__device__ __forceinline__ void store_relaxed(uint64* ptr, uint64 result) { - int64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" + "st.volatile.u64 [%0], %1;" #else - "ld.relaxed.gpu.s64 %0, [%1];" + "st.relaxed.gpu.u64 [%0], %1;" #endif - : "=l"(result) - : "l"(const_cast(ptr)) + ::"l"(ptr), + "l"(result) : "memory"); - - return result; } -__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +__device__ __forceinline__ uint64 atomic_add_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.s64 [%0], %1;" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed(const float* ptr) +__device__ __forceinline__ uint64 atomic_min_relaxed(uint64* ptr, uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f32 %0, [%1];" + "atom.relaxed.gpu.min.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_max_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f32 [%0], %1;" + "atom.relaxed.gpu.max.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_relaxed(const double* ptr) +__device__ __forceinline__ uint64 atomic_and_relaxed(uint64* ptr, uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f64 %0, [%1];" + "atom.relaxed.gpu.and.u64 %0, [%1], 
%2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f64 [%0], %1;" + "atom.relaxed.gpu.or.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "d"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } @@ -821,6 +2143,70 @@ __device__ __forceinline__ void store_release(double* ptr, double result) } +__device__ __forceinline__ uint32 load_acquire(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.gpu.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint32* ptr, uint32 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.gpu.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.gpu.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint64* ptr, uint64 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.gpu.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + __device__ __forceinline__ thrust::complex load_relaxed_shared( const thrust::complex* ptr) { diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 834c49dba46..ed6a6cb32cf 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -17,6 +17,8 @@ class ordering: fn_load_suffix: str ptx_store_suffix: str fn_store_suffix: str + ptx_loadstore_suffix: str + fn_loadstore_suffix: str is_relaxed: bool @@ -27,6 +29,13 @@ class type_desc: name: str +@dataclasses.dataclass +class operation: + fn_op_suffix: str + ptx_op_suffix: str + supports_float: bool + supports_signed: bool + memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), @@ -35,14 +44,23 @@ class type_desc: space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True), ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", - ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) + ptx_store_suffix=".release", fn_store_suffix="_release", + ptx_loadstore_suffix=".acq_rel", fn_loadstore_suffix="_acqrel", is_relaxed=False) ] types = 
[type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), - type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double"), + type_desc(ptx_type_suffix=".u32", val_constraint="r", name="uint32"), + type_desc(ptx_type_suffix=".u64", val_constraint="l", name="uint64")] +operations = [operation(fn_op_suffix="_add", ptx_op_suffix=".add", supports_float=True, supports_signed=True), + operation(fn_op_suffix="_min", ptx_op_suffix=".min", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_max", ptx_op_suffix=".max", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), + operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors // @@ -149,6 +167,32 @@ class type_desc: :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); }} +""") + for op in operations: + + if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): + # for some reason there is no signed 64 bit atomic support, + # but since the operations are equivalent in two's complement, we can use unsigned instead + new_type = ".u64" if t.ptx_type_suffix == ".s64" and op.fn_op_suffix == "_add" else t.ptx_type_suffix + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed: + print(f""" +__device__ __forceinline__ {t.name} atomic{op.fn_op_suffix}{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} value) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(value) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex From 9caaa2d0b4eace5e302091d110d16a5ff5aa1048 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 19:47:58 +0100 Subject: [PATCH 06/26] fix matrix symmetry --- test/factorization/cholesky_kernels.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index d408bc930f8..24cb4ae9b6a 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -45,14 +45,17 @@ class CholeskySymbolic : public CommonTestFixture { matrices.emplace_back( "example small", gko::initialize( - {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 0, 0, 1}}, ref)); + {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); + // this is the example from Liu 1990 https://doi.org/10.1137/0611010. 
+ // "The Role of Elimination Trees in Sparse Factorization" + matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, + {1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, @@ -65,7 +68,7 @@ class CholeskySymbolic : public CommonTestFixture { {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, From f2d970c4750ad6eb08c73a8fd11a77fd9055fc18 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 20:08:29 +0100 Subject: [PATCH 07/26] fix reference MST algorithm --- reference/factorization/cholesky_kernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e67778c1797..94d8903ca07 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -40,7 +40,8 @@ void compute_skeleton_tree(std::shared_ptr exec, { disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); + vector> edges(exec); + edges.reserve(nnz); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { From 0af05e2adb63d3ebf443f36a944152d636d61ec3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 20:24:50 +0100 Subject: [PATCH 08/26] add GPU MST algorithm for Cholesky preprocessing --- .../factorization/cholesky_kernels.cpp | 271 +++++++++++++++++- test/factorization/cholesky_kernels.cpp | 49 +++- 2 files changed, 308 insertions(+), 12 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index d5badf6ef43..61a1012e435 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -5,6 +5,7 @@ #include "core/factorization/cholesky_kernels.hpp" #include +#include #include #include @@ -21,6 +22,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" @@ -48,6 +50,163 @@ constexpr int default_block_size = 512; namespace kernel { +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = row; + 
worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomic_min_relaxed(ptr, value); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + using atomic_type = + std::conditional_t, int32, + unsigned long long>; + auto old_parent = + atomicCAS(reinterpret_cast(parents + old_rep), + static_cast(old_rep), + static_cast(new_rep)); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = atomic_add_relaxed(out_counter, 1); + out_sources[out_i] = edge_sources[edge_id]; + 
out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + template __global__ __launch_bounds__(default_block_size) void build_postorder_cols( IndexType num_rows, const IndexType* cols, const IndexType* row_ptrs, @@ -230,6 +389,109 @@ __global__ __launch_bounds__(default_block_size) void factorize( } // namespace kernel +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 8)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + 
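The worklist loop that follows implements Boruvka's minimum spanning tree algorithm: every round finds, for each component, the smallest incident edge leaving the component (mst_find_minimum) and then merges the two endpoint components along those minimal edges (mst_join_edges), until no cross-component edge remains. The following sequential sketch shows the same two-phase iteration; it is illustrative only, uses plain std containers instead of Ginkgo types, and treats the edge id as the weight, which matches the kernels since the edges are generated in row order:

#include <algorithm>
#include <limits>
#include <numeric>
#include <tuple>
#include <vector>

// Sequential Boruvka sketch (illustrative): edges are (source, target, id)
// tuples and the id doubles as the edge weight. Returns the ids of the
// edges selected for the minimum spanning forest.
std::vector<int> boruvka_sketch(int num_nodes,
                                std::vector<std::tuple<int, int, int>> worklist)
{
    std::vector<int> parent(num_nodes);
    std::iota(parent.begin(), parent.end(), 0);
    // representative lookup by pointer chasing, like mst_find
    auto find = [&](int v) {
        while (parent[v] != v) {
            v = parent[v];
        }
        return v;
    };
    std::vector<int> tree_edge_ids;
    while (!worklist.empty()) {
        // phase 1: keep cross-component edges and record, per component,
        // the smallest incident edge id (cf. mst_find_minimum)
        std::vector<int> min_edge(num_nodes, std::numeric_limits<int>::max());
        std::vector<std::tuple<int, int, int>> next;
        for (const auto& [src, tgt, id] : worklist) {
            const auto src_rep = find(src);
            const auto tgt_rep = find(tgt);
            if (src_rep != tgt_rep) {
                next.emplace_back(src_rep, tgt_rep, id);
                min_edge[src_rep] = std::min(min_edge[src_rep], id);
                min_edge[tgt_rep] = std::min(min_edge[tgt_rep], id);
            }
        }
        // phase 2: join components along their minimal edges
        // (cf. mst_join_edges)
        for (const auto& [src_rep, tgt_rep, id] : next) {
            if (min_edge[src_rep] == id || min_edge[tgt_rep] == id) {
                const auto a = find(src_rep);
                const auto b = find(tgt_rep);
                if (a != b) {
                    parent[std::max(a, b)] = std::min(a, b);
                    tree_edge_ids.push_back(id);
                }
            }
        }
        worklist = std::move(next);
    }
    return tree_edge_ids;
}

Since the edge ids are distinct, the minimal edges selected in one round cannot form a cycle, so every round either empties the worklist or merges at least two components; the CAS loop in mst_join_edges only has to resolve the case where several threads merge into the same component concurrently.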
components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + } + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); + + template void symbolic_factorize( std::shared_ptr exec, @@ -390,15 +652,6 @@ void factorize(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index 24cb4ae9b6a..a08b71d0959 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -14,6 +14,7 @@ #include #include +#include "core/components/disjoint_sets.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" @@ -111,6 +112,27 @@ class CholeskySymbolic : public CommonTestFixture { } } + index_type check_mst(const std::unique_ptr& mst) const + { + gko::matrix_data mst_data; + const auto size = mst->get_size(); + mst->write(mst_data); + gko::disjoint_sets sets(this->ref, size[0]); + index_type weight_sum{}; + // need an IIFE because the assertions would return something + [&] { + for (const auto entry : mst_data.nonzeros) { + ASSERT_GT(entry.row, entry.column); + weight_sum += entry.row; + const auto row_rep = sets.find(entry.row); + const auto col_rep = sets.find(entry.column); + ASSERT_NE(row_rep, col_rep); + sets.join(row_rep, col_rep); + } + }(); + return weight_sum; + } + std::vector>> matrices; gko::array tmp; @@ -136,14 +158,18 @@ TYPED_TEST_SUITE(CholeskySymbolic, Types, PairTypenameNameGenerator); TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) { using matrix_type = typename 
TestFixture::matrix_type; + using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; + using elimination_forest = typename TestFixture::elimination_forest; for (const auto& pair : this->matrices) { SCOPED_TRACE(pair.first); const auto& mtx = pair.second; + // check for correctness: is mtx symmetric? + GKO_ASSERT_MTX_EQ_SPARSITY(mtx, gko::as(mtx->transpose())); const auto dmtx = gko::clone(this->exec, mtx); const auto size = mtx->get_size(); - auto skeleton = matrix_type::create(this->ref, size, size[0]); - auto dskeleton = matrix_type::create(this->exec, size, size[0]); + const auto skeleton = matrix_type::create(this->ref, size, size[0]); + const auto dskeleton = matrix_type::create(this->exec, size, size[0]); gko::kernels::reference::cholesky::compute_skeleton_tree( this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), @@ -152,7 +178,24 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); - GKO_ASSERT_MTX_EQ_SPARSITY(skeleton, dskeleton); + // check that the created graphs are trees and have the same edge sum + const auto weight_sum = this->check_mst(skeleton); + const auto dweight_sum = this->check_mst(dskeleton); + ASSERT_EQ(weight_sum, dweight_sum); + // while the MSTs may be different, they should produce the same + // elimination forest as the original matrix + std::unique_ptr forest; + std::unique_ptr skeleton_forest; + std::unique_ptr dskeleton_forest; + gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elim_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elim_forest(dskeleton.get(), + dskeleton_forest); + // the parents array fully determines the elimination forest + GKO_ASSERT_ARRAY_EQ(forest->parents, skeleton_forest->parents); + GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, + dskeleton_forest->parents); } } From 7158492325d58697df13b7e3c81877396cc6b1e4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 21:12:35 +0100 Subject: [PATCH 09/26] add AMD support --- .../factorization/cholesky_kernels.cpp | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 61a1012e435..54fa3c8caa9 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -58,6 +58,8 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -65,7 +67,8 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( const auto row = rows[i]; const auto col = cols[i]; if (col < row) { - const auto out_i = atomic_add_relaxed(worklist_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); worklist_sources[out_i] = row; worklist_targets[out_i] = col; worklist_edge_ids[out_i] = i; @@ -100,9 +103,12 @@ __device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) template __device__ void guarded_atomic_min(IndexType* ptr, IndexType value) { + using atomic_type = std::conditional_t, + int32, unsigned 
long long>; // only execute the atomic if we know that it might have an effect if (load_relaxed_local(ptr) > value) { - atomic_min_relaxed(ptr, value); + atomicMin(reinterpret_cast(ptr), + static_cast(value)); } } @@ -118,6 +124,8 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -128,7 +136,8 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( const auto source_rep = mst_find(parents, source); const auto target_rep = mst_find(parents, target); if (source_rep != target_rep) { - const auto out_i = atomic_add_relaxed(worklist_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); worklist_sources[out_i] = source_rep; worklist_targets[out_i] = target_rep; worklist_edge_ids[out_i] = edge_id; @@ -149,6 +158,8 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, IndexType* __restrict__ out_counter) { + using atomic_type = std::conditional_t, + int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -166,9 +177,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( bool repeat = false; do { repeat = false; - using atomic_type = - std::conditional_t, int32, - unsigned long long>; auto old_parent = atomicCAS(reinterpret_cast(parents + old_rep), static_cast(old_rep), @@ -180,7 +188,8 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( repeat = true; } } while (repeat); - const auto out_i = atomic_add_relaxed(out_counter, 1); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(out_counter), atomic_type{1})); out_sources[out_i] = edge_sources[edge_id]; out_targets[out_i] = edge_targets[edge_id]; } From 7d96d1d7dd6fb56dcb63679c1a72b5ce18506c7a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 19 Dec 2024 21:39:42 +0100 Subject: [PATCH 10/26] add benchmark for MST-enhanced symbolic Cholesky --- benchmark/sparse_blas/operations.cpp | 31 +++++++++++++---- core/factorization/symbolic.cpp | 51 ++++++++++++++++++++++++++++ core/factorization/symbolic.hpp | 16 +++++++++ 3 files changed, 92 insertions(+), 6 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 61eeaeeb627..1b4e5819793 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -614,8 +614,9 @@ class SymbolicLuNearSymmOperation : public BenchmarkOperation { class SymbolicCholeskyOperation : public BenchmarkOperation { public: - explicit SymbolicCholeskyOperation(const Mtx* mtx, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, result_{} + explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, + bool symmetric) + : mtx_{mtx}, symmetric_{symmetric}, device_{device}, result_{} {} std::pair validate() const override @@ -643,8 +644,13 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { void run() override { - gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, - forest_); + if (device_) { + gko::factorization::symbolic_cholesky_device(mtx_, symmetric_, + result_, forest_); + } else { + gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, + forest_); + } } void write_stats(json& object) 
override @@ -655,6 +661,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; bool symmetric_; + bool device_; std::unique_ptr result_; std::unique_ptr> forest_; }; @@ -791,13 +798,25 @@ const std::map(mtx); }}, + {"symbolic_cholesky_device", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + false); + }}, + {"symbolic_cholesky_device_symmetric", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + true); + }}, {"symbolic_cholesky", [](const Mtx* mtx) { - return std::make_unique(mtx, false); + return std::make_unique(mtx, false, + false); }}, {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { - return std::make_unique(mtx, true); + return std::make_unique(mtx, false, + true); }}, {"reorder_rcm", [](const Mtx* mtx) { diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index b08602e1a1a..3d59ec78ac0 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -24,6 +24,7 @@ namespace factorization { namespace { +GKO_REGISTER_OPERATION(compute_skeleton_tree, cholesky::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, @@ -80,6 +81,56 @@ void symbolic_cholesky( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY); +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest) +{ + using matrix_type = matrix::Csr; + GKO_ASSERT_IS_SQUARE_MATRIX(mtx); + const auto exec = mtx->get_executor(); + const auto num_rows = mtx->get_size()[0]; + const auto host_exec = exec->get_master(); + { + const auto skeleton = + matrix_type::create(exec, mtx->get_size(), num_rows); + exec->run(make_compute_skeleton_tree( + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, + skeleton->get_row_ptrs(), skeleton->get_col_idxs())); + exec->run(make_compute_elim_forest(skeleton.get(), forest)); + } + array row_ptrs{exec, num_rows + 1}; + array tmp{exec}; + exec->run(make_symbolic_count(mtx, *forest, row_ptrs.get_data(), tmp)); + exec->run(make_prefix_sum_nonnegative(row_ptrs.get_data(), num_rows + 1)); + const auto factor_nnz = + static_cast(get_element(row_ptrs, num_rows)); + factors = matrix_type::create( + exec, mtx->get_size(), array{exec, factor_nnz}, + array{exec, factor_nnz}, std::move(row_ptrs)); + exec->run(make_symbolic(mtx, *forest, factors.get(), tmp)); + factors->sort_by_column_index(); + if (symmetrize) { + auto lt_factor = as(factors->transpose()); + const auto scalar = + initialize>({one()}, exec); + const auto id = matrix::Identity::create(exec, num_rows); + lt_factor->apply(scalar, id, scalar, factors); + } +} + + +#define GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE(ValueType, IndexType) \ + void symbolic_cholesky_device( \ + const matrix::Csr* mtx, bool symmetrize, \ + std::unique_ptr>& factors, \ + std::unique_ptr>& forest) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE); + + template void symbolic_lu_near_symm( const matrix::Csr* mtx, diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 096d8c998bc..84478e214e6 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -25,6 +25,22 @@ void symbolic_cholesky( std::unique_ptr>& factors, std::unique_ptr>& forest); + +/** + * Computes the symbolic Cholesky factorization of the given matrix 
on the + * device. + * + * @param mtx the input matrix + * @param symmetrize output the pattern of L + L^T (true) or just L (false)? + * @param factors the output factor(s) + * @param forest the elimination forest of the input matrix + */ +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest); + /** * Computes the symbolic LU factorization of the given matrix. * From ef4b6f19d009288bc69059982246a72dd66c760c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 20 Dec 2024 20:24:00 +0100 Subject: [PATCH 11/26] extract elimination forest kernels into separate file --- common/cuda_hip/CMakeLists.txt | 1 + .../factorization/cholesky_kernels.cpp | 328 --------------- .../elimination_forest_kernels.cpp | 381 ++++++++++++++++++ core/device_hooks/common_kernels.inc.cpp | 14 +- core/factorization/cholesky.cpp | 5 +- core/factorization/cholesky_kernels.hpp | 18 - .../elimination_forest_kernels.hpp | 75 ++++ core/factorization/ic.cpp | 6 +- core/factorization/symbolic.cpp | 4 +- dpcpp/CMakeLists.txt | 1 + dpcpp/factorization/cholesky_kernels.dp.cpp | 19 - .../elimination_forest_kernels.dp.cpp | 46 +++ omp/CMakeLists.txt | 1 + omp/factorization/cholesky_kernels.cpp | 89 ---- .../elimination_forest_kernels.cpp | 118 ++++++ reference/CMakeLists.txt | 1 + reference/factorization/cholesky_kernels.cpp | 85 ---- .../elimination_forest_kernels.cpp | 116 ++++++ .../test/factorization/cholesky_kernels.cpp | 5 +- test/factorization/cholesky_kernels.cpp | 13 +- 20 files changed, 772 insertions(+), 554 deletions(-) create mode 100644 common/cuda_hip/factorization/elimination_forest_kernels.cpp create mode 100644 core/factorization/elimination_forest_kernels.hpp create mode 100644 dpcpp/factorization/elimination_forest_kernels.dp.cpp create mode 100644 omp/factorization/elimination_forest_kernels.cpp create mode 100644 reference/factorization/elimination_forest_kernels.cpp diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 5cfa55ca687..d7d1de30dea 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -12,6 +12,7 @@ set(CUDA_HIP_SOURCES distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp factorization/factorization_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp factorization/lu_kernels.cpp diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 54fa3c8caa9..483144d8708 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -50,172 +50,6 @@ constexpr int default_block_size = 512; namespace kernel { -template -__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( - const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, - IndexType size, IndexType* __restrict__ worklist_sources, - IndexType* __restrict__ worklist_targets, - IndexType* __restrict__ worklist_edge_ids, - IndexType* __restrict__ worklist_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto row = rows[i]; - const auto col = cols[i]; - if (col < row) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); - worklist_sources[out_i] = row; - worklist_targets[out_i] = col; - 
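For orientation, the following sketch shows how the new device entry point is meant to be called, mirroring the benchmark operation above; the concrete types, includes and the example function name are illustrative, not part of the patch:

#include <memory>

#include <ginkgo/ginkgo.hpp>

#include "core/factorization/elimination_forest.hpp"
#include "core/factorization/symbolic.hpp"

// Illustrative usage sketch of symbolic_cholesky_device (assumed setup:
// a symmetric CSR matrix that already resides on the target executor).
using csr_type = gko::matrix::Csr<double, gko::int32>;

void run_symbolic_cholesky_device(const csr_type* mtx)
{
    std::unique_ptr<csr_type> factors;
    std::unique_ptr<gko::factorization::elimination_forest<gko::int32>> forest;
    // symmetrize = true requests the pattern of L + L^T, false only L
    gko::factorization::symbolic_cholesky_device(mtx, /* symmetrize */ true,
                                                 factors, forest);
    // factors now holds the symbolic factor pattern,
    // forest the elimination forest of mtx
}

Internally, symbolic_cholesky_device first computes a skeleton tree on the device and derives the elimination forest from that much sparser matrix, as shown in the core/factorization/symbolic.cpp hunk above.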
worklist_edge_ids[out_i] = i; - } -} - - -template -__device__ IndexType mst_find(const IndexType* parents, IndexType node) -{ - auto parent = parents[node]; - while (parent != node) { - node = parent; - parent = parents[node]; - }; - return parent; -} - - -template -__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) -{ - auto parent = load_relaxed_local(parents + node); - while (parent != node) { - node = parent; - parent = load_relaxed_local(parents + node); - }; - return parent; -} - - -template -__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - // only execute the atomic if we know that it might have an effect - if (load_relaxed_local(ptr) > value) { - atomicMin(reinterpret_cast(ptr), - static_cast(value)); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_find_minimum( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, - const IndexType* __restrict__ in_edge_ids, IndexType size, - const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, - IndexType* __restrict__ worklist_sources, - IndexType* __restrict__ worklist_targets, - IndexType* __restrict__ worklist_edge_ids, - IndexType* __restrict__ worklist_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - const auto edge_id = in_edge_ids[i]; - const auto source_rep = mst_find(parents, source); - const auto target_rep = mst_find(parents, target); - if (source_rep != target_rep) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); - worklist_sources[out_i] = source_rep; - worklist_targets[out_i] = target_rep; - worklist_edge_ids[out_i] = edge_id; - guarded_atomic_min(min_edge + source_rep, edge_id); - guarded_atomic_min(min_edge + target_rep, edge_id); - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_join_edges( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, - const IndexType* __restrict__ in_edge_ids, IndexType size, - IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, - const IndexType* __restrict__ edge_sources, - const IndexType* __restrict__ edge_targets, - IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, - IndexType* __restrict__ out_counter) -{ - using atomic_type = std::conditional_t, - int32, unsigned long long>; - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - const auto edge_id = in_edge_ids[i]; - if (min_edge[source] == edge_id || min_edge[target] == edge_id) { - // join source and sink - const auto source_rep = mst_find_relaxed(parents, source); - const auto target_rep = mst_find_relaxed(parents, target); - assert(source_rep != target_rep); - const auto new_rep = min(source_rep, target_rep); - auto old_rep = max(source_rep, target_rep); - bool repeat = false; - do { - repeat = false; - auto old_parent = - atomicCAS(reinterpret_cast(parents + old_rep), - static_cast(old_rep), - static_cast(new_rep)); - // if this fails, the parent of old_rep changed recently, so we need - // to try again by updating the parent's parent (hopefully its rep) - if (old_parent != old_rep) { - 
old_rep = old_parent; - repeat = true; - } - } while (repeat); - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(out_counter), atomic_type{1})); - out_sources[out_i] = edge_sources[edge_id]; - out_targets[out_i] = edge_targets[edge_id]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( - const IndexType* __restrict__ in_sources, - const IndexType* __restrict__ in_targets, IndexType size, - IndexType* __restrict__ min_edge) -{ - constexpr auto sentinel = std::numeric_limits::max(); - const auto i = thread::get_thread_id_flat(); - if (i >= size) { - return; - } - const auto source = in_sources[i]; - const auto target = in_targets[i]; - // we could write the values non-atomically, but this makes race checkers - // happier without a performance penalty (hopefully, thanks to _local) - store_relaxed_local(min_edge + source, sentinel); - store_relaxed_local(min_edge + target, sentinel); -} - - template __global__ __launch_bounds__(default_block_size) void build_postorder_cols( IndexType num_rows, const IndexType* cols, const IndexType* row_ptrs, @@ -398,109 +232,6 @@ __global__ __launch_bounds__(default_block_size) void factorize( } // namespace kernel -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - const auto policy = thrust_policy(exec); - const auto nnz = exec->copy_val_to_host(row_ptrs + size); - // convert edges to COO representation - // the edge list is sorted, since we only consider edges where row > col, - // and the row array (= weights) is sorted coming from row_ptrs - array row_array{exec, static_cast(nnz)}; - const auto rows = row_array.get_data(); - components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); - // we assume the matrix is symmetric, so we can remove every second edge - // also round up the worklist size for equal cache alignment between fields - const auto worklist_size = - ceildiv(nnz, config::warp_size * 2) * config::warp_size; - // create 2 worklists consisting of (start, end, edge_id) - array worklist{exec, static_cast(worklist_size * 8)}; - auto wl1_source = worklist.get_data(); - auto wl1_target = wl1_source + worklist_size; - auto wl1_edge_id = wl1_target + worklist_size; - auto wl2_source = wl1_source + 3 * worklist_size; - auto wl2_target = wl1_target + 3 * worklist_size; - auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; - // atomic counters for worklists and output edge list - array counters{exec, 3}; - auto wl1_counter = counters.get_data(); - auto wl2_counter = wl1_counter + 1; - auto output_counter = wl2_counter + 1; - components::fill_array(exec, counters.get_data(), counters.get_size(), - IndexType{}); - // helpers for interacting with worklists - const auto clear_wl1 = [&] { - IndexType value{}; - exec->copy_from(exec->get_master(), 1, &value, wl1_counter); - }; - const auto clear_wl2 = [&] { - IndexType value{}; - exec->copy_from(exec->get_master(), 1, &value, wl2_counter); - }; - const auto get_wl1_size = [&] { - return exec->copy_val_to_host(wl1_counter); - }; - const auto swap_wl1_wl2 = [&] { - std::swap(wl1_source, wl2_source); - std::swap(wl1_target, wl2_target); - std::swap(wl1_edge_id, wl2_edge_id); - std::swap(wl1_counter, wl2_counter); - }; - // initialize every node to a singleton set - array parent_array{exec, size}; - const auto parents = parent_array.get_data(); - components::fill_seq_array(exec, parents, size); - // array storing the 
minimal edge adjacent to each node - array min_edge_array{exec, size}; - const auto min_edges = min_edge_array.get_data(); - constexpr auto min_edge_sentinel = std::numeric_limits::max(); - components::fill_array(exec, min_edges, size, min_edge_sentinel); - // output row array, to be used in conjunction with out_cols in COO storage - array out_row_array{exec, size}; - const auto out_rows = out_row_array.get_data(); - // initialize worklist1 with forward edges - { - const auto num_blocks = ceildiv(nnz, default_block_size); - kernel::mst_initialize_worklist<<>>( - rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); - } - auto wl1_size = get_wl1_size(); - while (wl1_size > 0) { - clear_wl2(); - // attach each node to its smallest adjacent non-cycle edge - { - const auto num_blocks = ceildiv(wl1_size, default_block_size); - kernel::mst_find_minimum<<>>( - wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, - min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); - } - clear_wl1(); - swap_wl1_wl2(); - wl1_size = get_wl1_size(); - if (wl1_size > 0) { - // join minimal edges - const auto num_blocks = ceildiv(wl1_size, default_block_size); - kernel::mst_join_edges<<>>( - wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, - min_edges, rows, cols, out_rows, out_cols, output_counter); - kernel::mst_reset_min_edges<<>>( - wl1_source, wl1_target, wl1_size, min_edges); - } - } - const auto num_mst_edges = exec->copy_val_to_host(output_counter); - thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); - thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, - out_cols); - components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_factorize( std::shared_ptr exec, @@ -534,65 +265,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void build_children_from_parents( - std::shared_ptr exec, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = forest.parents.get_size(); - // build COO representation of the tree - array col_idx_array{exec, num_rows}; - const auto col_idxs = col_idx_array.get_data(); - const auto parents = forest.parents.get_const_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - exec->copy(num_rows, parents, col_idxs); - thrust::sequence(thrust_policy(exec), children, children + num_rows, - IndexType{}); - // group by parent - thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, - col_idxs + num_rows, children); - // create child pointers for groups of children - components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, - num_rows + 1, // rows plus sentinel - child_ptrs); -} - - -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = factors->get_size()[0]; - const auto it = thrust::make_counting_iterator(IndexType{}); - thrust::transform( - thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), - [row_ptrs = factors->get_const_row_ptrs(), - col_idxs = factors->get_const_col_idxs(), - num_rows] __device__(IndexType l_col) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // 
parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - return l_row; - } - } - // sentinel pseudo-root - return static_cast(num_rows); - }); - build_children_from_parents(exec, forest); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..6291b7628bc --- /dev/null +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,381 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace elimination_forest { + + +constexpr int default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); + worklist_sources[out_i] = row; + worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomicMin(reinterpret_cast(ptr), + static_cast(value)); + } +} + + +template +__global__ 
__launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(worklist_counter), atomic_type{1})); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + using atomic_type = std::conditional_t, + int32, unsigned long long>; + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + auto old_parent = + atomicCAS(reinterpret_cast(parents + old_rep), + static_cast(old_rep), + static_cast(new_rep)); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = static_cast(atomicAdd( + reinterpret_cast(out_counter), atomic_type{1})); + out_sources[out_i] = edge_sources[edge_id]; + out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + 
store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + +} // namespace kernel + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 8)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // 
join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + } + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void build_children_from_parents( + std::shared_ptr exec, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = forest.parents.get_size(); + // build COO representation of the tree + array col_idx_array{exec, num_rows}; + const auto col_idxs = col_idx_array.get_data(); + const auto parents = forest.parents.get_const_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + exec->copy(num_rows, parents, col_idxs); + thrust::sequence(thrust_policy(exec), children, children + num_rows, + IndexType{}); + // group by parent + thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, + col_idxs + num_rows, children); + // create child pointers for groups of children + components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, + num_rows + 1, // rows plus sentinel + child_ptrs); +} + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = factors->get_size()[0]; + const auto it = thrust::make_counting_iterator(IndexType{}); + thrust::transform( + thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), + [row_ptrs = factors->get_const_row_ptrs(), + col_idxs = factors->get_const_col_idxs(), + num_rows] __device__(IndexType l_col) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + return l_row; + } + } + // sentinel pseudo-root + return static_cast(num_rows); + }); + build_children_from_parents(exec, forest); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 668a4e0f6bf..19c9917ed4b 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -23,6 +23,7 @@ #include "core/distributed/partition_kernels.hpp" #include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/factorization/ilu_kernels.hpp" @@ -927,10 +928,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); namespace cholesky { 
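To make the forest layout concrete: elimination_forest stores a parents array plus a CSR-like (child_ptrs, children) pair, with num_rows acting as a sentinel pseudo-root, and build_children_from_parents above fills the latter two from the former. A sequential analogue with a hand-computed example, illustrative only:

#include <algorithm>
#include <numeric>
#include <vector>

// Sequential analogue of build_children_from_parents (illustrative):
// group the node ids by their parent; num_rows acts as the sentinel
// pseudo-root, so child_ptrs has num_rows + 2 entries.
void build_children_sketch(const std::vector<int>& parents,
                           std::vector<int>& children,
                           std::vector<int>& child_ptrs)
{
    const auto num_rows = static_cast<int>(parents.size());
    children.resize(num_rows);
    std::iota(children.begin(), children.end(), 0);
    // stable sort of the node ids by parent, like the
    // thrust::stable_sort_by_key call above
    std::stable_sort(children.begin(), children.end(),
                     [&](int a, int b) { return parents[a] < parents[b]; });
    // counting-sort style pointer construction, like convert_idxs_to_ptrs
    child_ptrs.assign(num_rows + 2, 0);
    for (const auto parent : parents) {
        child_ptrs[parent + 1]++;
    }
    std::partial_sum(child_ptrs.begin(), child_ptrs.end(), child_ptrs.begin());
}

// Example: a 4x4 factor with strictly lower-triangular nonzeros at
// (2, 0), (2, 1) and (3, 2) yields parents = {2, 2, 3, 4} via
// parent[j] = min(i | i > j and l_ij != 0), and from that
// children = {0, 1, 2, 3} and child_ptrs = {0, 0, 0, 2, 3, 4}:
// the children of node p are children[child_ptrs[p]] up to, but excluding,
// children[child_ptrs[p + 1]], e.g. node 2 has the children {0, 1}.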
-GKO_STUB_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); @@ -938,6 +937,17 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky +namespace elimination_forest { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES); +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + +} // namespace elimination_forest + + namespace factorization { diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index ce189e3bbdc..2f8d8630a1e 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -15,6 +15,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -27,7 +28,7 @@ namespace { GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -102,7 +103,7 @@ std::unique_ptr Cholesky::generate_impl( forest = std::make_unique>( exec, num_rows); - exec->run(make_forest_from_factor(factors.get(), *forest)); + exec->run(make_from_factor(factors.get(), *forest)); } // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 9a9a165e6b4..f80e057da3a 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -20,13 +20,6 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType) \ - void compute_skeleton_tree(std::shared_ptr exec, \ - const IndexType* row_ptrs, \ - const IndexType* cols, size_type size, \ - IndexType* out_row_ptrs, IndexType* out_cols) - - #define GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType) \ void symbolic_count( \ std::shared_ptr exec, \ @@ -44,13 +37,6 @@ namespace kernels { const array& tmp_storage) -#define GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType) \ - void forest_from_factor( \ - std::shared_ptr exec, \ - const matrix::Csr* factors, \ - gko::factorization::elimination_forest& forest) - - #define GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType) \ void initialize(std::shared_ptr exec, \ const matrix::Csr* mtx, \ @@ -73,15 +59,11 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE(IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType); \ template \ - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType); \ 
- template \ GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_FACTORIZE(ValueType, IndexType) diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp new file mode 100644 index 00000000000..8d76ee73b54 --- /dev/null +++ b/core/factorization/elimination_forest_kernels.hpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ + + +#include "core/factorization/elimination_forest.hpp" + +#include + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + +#define GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) \ + void from_factor( \ + std::shared_ptr exec, \ + const matrix::Csr* factors, \ + gko::factorization::elimination_forest& forest) + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType) \ + void compute_subtree_sizes( \ + std::shared_ptr exec, \ + const gko::factorization::elimination_forest& forest, \ + IndexType* subtree_size) + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) \ + void compute_postorder( \ + std::shared_ptr exec, \ + const gko::factorization::elimination_forest& forest, \ + const IndexType* subtree_size, IndexType* postorder, \ + IndexType* inv_postorder) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 46d45ba5f04..3eb14e05a4a 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -18,6 +18,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -37,7 +38,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l, GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); // for gko syncfree implementation GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -119,8 +120,7 @@ std::unique_ptr> Ic::generate( forest = std::make_unique>( exec, num_rows); - exec->run( - 
ic_factorization::make_forest_from_factor(factors.get(), *forest)); + exec->run(ic_factorization::make_from_factor(factors.get(), *forest)); // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 3d59ec78ac0..9c7e1785396 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -15,6 +15,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -24,7 +25,8 @@ namespace factorization { namespace { -GKO_REGISTER_OPERATION(compute_skeleton_tree, cholesky::compute_skeleton_tree); +GKO_REGISTER_OPERATION(compute_skeleton_tree, + elimination_forest::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 81a2a6034ea..a5fc043e171 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(ginkgo_dpcpp distributed/partition_kernels.dp.cpp distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp + factorization/elimination_forest_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp factorization/factorization_kernels.dp.cpp diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 1461f918be7..50bdbceaa75 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -25,15 +25,6 @@ namespace dpcpp { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -151,16 +142,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor(std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& - forest) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp new file mode 100644 index 00000000000..4aecc0993f9 --- /dev/null +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include + +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace climination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + 
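// Illustration (assumption-laden sketch, not Ginkgo code): the skeleton tree
// declared above reduces a symmetric sparsity pattern to a spanning forest
// with at most n - 1 lower-triangular edges while preserving the elimination
// forest of the original matrix. The host kernels later in this series do
// this with Kruskal's algorithm over a disjoint-set structure; a minimal
// standalone version of that idea, using plain std:: containers and
// hypothetical names, could look like this:
#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

struct union_find {
    std::vector<int> parent;
    explicit union_find(int n) : parent(n)
    {
        std::iota(parent.begin(), parent.end(), 0);
    }
    int find(int i) { return parent[i] == i ? i : parent[i] = find(parent[i]); }
    bool join(int a, int b)
    {
        a = find(a);
        b = find(b);
        if (a == b) return false;
        parent[a] = b;
        return true;
    }
};

// Collect lower-triangular edges (row > col) of a CSR pattern and keep only
// those that connect two previously disconnected components. Visiting rows in
// ascending order corresponds to Kruskal's sort by edge weight, where the
// weight of an edge is its larger endpoint.
std::vector<std::pair<int, int>> skeleton_tree(const std::vector<int>& row_ptrs,
                                               const std::vector<int>& cols)
{
    const auto n = static_cast<int>(row_ptrs.size()) - 1;
    union_find sets(n);
    std::vector<std::pair<int, int>> tree;
    for (int row = 0; row < n; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col < row && sets.join(row, col)) {
                tree.emplace_back(row, col);  // stored as (max, min) pair
            }
        }
    }
    return tree;
}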
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace climination_forest +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 700a01116c9..31f369356f8 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -17,6 +17,7 @@ target_sources(ginkgo_omp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index 293bb70ff8b..76b1154512b 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -30,52 +30,6 @@ namespace omp { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - disjoint_sets sets(exec, size); - const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); - // collect edge list - for (auto row : irange(static_cast(size))) { - for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { - const auto col = cols[nz]; - // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); - } - } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); - // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); - IndexType output_count{}; - // Kruskal algorithm: Connect unconnected components using edges with - // ascending weight - for (const auto edge : edges) { - const auto first_rep = sets.find(edge.first); - const auto second_rep = sets.find(edge.second); - if (first_rep != second_rep) { - // we are only interested in the lower triangle, so we add an edge - // max -> min - out_rows[output_count] = edge.first; - out_cols[output_count] = edge.second; - output_count++; - sets.join(first_rep, second_rep); - } - } - assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); - components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -178,49 +132,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - const auto num_rows = static_cast(factors->get_size()[0]); - components::fill_array(exec, parents, num_rows, num_rows); -#pragma omp parallel for - for (IndexType l_col = 0; l_col < num_rows; l_col++) { - 
const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - parents[l_col] = l_row; - break; - } - } - } - // group by parent - array parents_copy{exec, static_cast(num_rows)}; - exec->copy(num_rows, parents, parents_copy.get_data()); - components::fill_seq_array(exec, children, num_rows); - const auto it = - detail::make_zip_iterator(parents_copy.get_data(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), - num_rows, num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..e150877e3a2 --- /dev/null +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(nnz, exec); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& 
forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + const auto num_rows = static_cast(factors->get_size()[0]); + components::fill_array(exec, parents, num_rows, num_rows); +#pragma omp parallel for + for (IndexType l_col = 0; l_col < num_rows; l_col++) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + parents[l_col] = l_row; + break; + } + } + } + // group by parent + array parents_copy{exec, static_cast(num_rows)}; + exec->copy(num_rows, parents, parents_copy.get_data()); + components::fill_seq_array(exec, children, num_rows); + const auto it = + detail::make_zip_iterator(parents_copy.get_data(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), + num_rows, num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 94e61d43d5c..27c04a4db09 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(ginkgo_reference distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index 94d8903ca07..e489a3a2c57 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -32,53 +32,6 @@ namespace reference { namespace cholesky { -template -void compute_skeleton_tree(std::shared_ptr exec, - const IndexType* row_ptrs, const IndexType* cols, - size_type size, IndexType* out_row_ptrs, - IndexType* out_cols) -{ - disjoint_sets sets(exec, size); - const auto nnz = static_cast(row_ptrs[size]); - vector> edges(exec); - edges.reserve(nnz); - // collect edge list - for (auto row : irange(static_cast(size))) { - for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { - const auto col = cols[nz]; - // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); - } - } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); - // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); - IndexType output_count{}; - // Kruskal algorithm: Connect unconnected components using edges with - // ascending weight - for (const auto edge : edges) { - const auto first_rep = sets.find(edge.first); - const auto second_rep = sets.find(edge.second); - if (first_rep != second_rep) { - // we are only interested in the lower triangle, so we add an edge - // max -> min - out_rows[output_count] = edge.first; - out_cols[output_count] = 
edge.second; - output_count++; - sets.join(first_rep, second_rep); - } - } - assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); - components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, - out_row_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CHOLESKY_COMPUTE_SKELETON_TREE); - - template void symbolic_count(std::shared_ptr exec, const matrix::Csr* mtx, @@ -154,44 +107,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto num_rows = static_cast(factors->get_size()[0]); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - // filled with sentinel for unattached nodes - std::fill_n(parents, num_rows, num_rows); - for (IndexType row = 0; row < num_rows; row++) { - const auto row_begin = row_ptrs[row]; - const auto row_end = row_ptrs[row + 1]; - for (auto nz = row_begin; nz < row_end; nz++) { - const auto col = col_idxs[nz]; - // use the lower triangle, min row from column - if (col < row) { - parents[col] = std::min(parents[col], row); - } - } - } - // group by parent - vector parents_copy(parents, parents + num_rows, {exec}); - std::iota(children, children + num_rows, 0); - const auto it = detail::make_zip_iterator(parents_copy.begin(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, - num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..bd26900b78e --- /dev/null +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,116 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + // edge contains (max, min) pair + auto edge = std::minmax(row, col, std::greater{}); + edges.push_back(edge); + } + } + // sort edge list ascending by edge weight + std::sort(edges.begin(), edges.end()); + // output 
helper array: Store row indices for output rows + // since we sorted by edge.first == row, this will be sorted + std::vector out_rows(size); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto num_rows = static_cast(factors->get_size()[0]); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + // filled with sentinel for unattached nodes + std::fill_n(parents, num_rows, num_rows); + for (IndexType row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + // use the lower triangle, min row from column + if (col < row) { + parents[col] = std::min(parents[col], row); + } + } + } + // group by parent + vector parents_copy(parents, parents + num_rows, {exec}); + std::iota(children, children + num_rows, 0); + const auto it = detail::make_zip_iterator(parents_copy.begin(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, + num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index cf72c13a0b1..9b47fcd6ad1 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -17,6 +17,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -248,7 +249,7 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) gko::factorization::compute_elim_forest(this->mtx.get(), this->forest); - gko::kernels::reference::cholesky::compute_skeleton_tree( + gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); @@ -360,7 +361,7 @@ TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing) elimination_forest forest{this->ref, static_cast(this->num_rows)}; - 
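// Illustration (assumption-laden sketch, not Ginkgo code): from_factor above
// reads the elimination forest directly off the symbolic factor, using
// parent[j] = min{ i > j : l_ij != 0 }, and then groups the nodes by parent to
// build the children/child_ptrs arrays. The same computation on plain CSR
// arrays, with hypothetical names, could look like this:
#include <algorithm>
#include <numeric>
#include <vector>

struct simple_forest {
    std::vector<int> parents;     // parent[j], or n for roots (pseudo-root n)
    std::vector<int> children;    // node ids grouped by parent
    std::vector<int> child_ptrs;  // offsets into children, one range per parent
};

simple_forest forest_from_lower_factor(const std::vector<int>& row_ptrs,
                                       const std::vector<int>& cols)
{
    const auto n = static_cast<int>(row_ptrs.size()) - 1;
    simple_forest f;
    f.parents.assign(n, n);  // sentinel: unattached nodes point to pseudo-root
    for (int row = 0; row < n; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col < row) {  // strictly lower triangle: l_{row,col} != 0
                f.parents[col] = std::min(f.parents[col], row);
            }
        }
    }
    // group nodes by parent: a stable sort keeps children in ascending order,
    // then a counting pass plus prefix sum yields the CSR-like child_ptrs
    f.children.resize(n);
    std::iota(f.children.begin(), f.children.end(), 0);
    std::stable_sort(f.children.begin(), f.children.end(),
                     [&](int a, int b) { return f.parents[a] < f.parents[b]; });
    f.child_ptrs.assign(n + 2, 0);
    for (int node = 0; node < n; node++) {
        f.child_ptrs[f.parents[node] + 1]++;
    }
    std::partial_sum(f.child_ptrs.begin(), f.child_ptrs.end(),
                     f.child_ptrs.begin());
    return f;
}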
gko::kernels::reference::cholesky::forest_from_factor( + gko::kernels::reference::elimination_forest::from_factor( this->ref, combined_factor.get(), forest); this->assert_equal_forests(forest, *forest_ref); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index a08b71d0959..f4c4424c4e4 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -18,6 +18,7 @@ #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" @@ -171,12 +172,14 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) const auto skeleton = matrix_type::create(this->ref, size, size[0]); const auto dskeleton = matrix_type::create(this->exec, size, size[0]); - gko::kernels::reference::cholesky::compute_skeleton_tree( + gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::compute_skeleton_tree( - this->exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - size[0], dskeleton->get_row_ptrs(), dskeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest:: + compute_skeleton_tree(this->exec, dmtx->get_const_row_ptrs(), + dmtx->get_const_col_idxs(), size[0], + dskeleton->get_row_ptrs(), + dskeleton->get_col_idxs()); // check that the created graphs are trees and have the same edge sum const auto weight_sum = this->check_mst(skeleton); @@ -307,7 +310,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest::from_factor( this->exec, dfactors.get(), dforest); this->assert_equal_forests(*forest, dforest); From 213ced61f7427920e538e7c4daec68fed460215b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 11 Jan 2025 14:38:15 +0100 Subject: [PATCH 12/26] update copyright years --- common/cuda_hip/components/memory.nvidia.hpp.inc | 2 +- common/cuda_hip/factorization/cholesky_kernels.cpp | 2 +- common/cuda_hip/factorization/elimination_forest_kernels.cpp | 2 +- core/factorization/cholesky_kernels.hpp | 2 +- core/factorization/elimination_forest_kernels.hpp | 2 +- core/factorization/symbolic.hpp | 2 +- core/test/utils/assertions.hpp | 2 +- dpcpp/factorization/cholesky_kernels.dp.cpp | 2 +- dpcpp/factorization/elimination_forest_kernels.dp.cpp | 2 +- omp/factorization/cholesky_kernels.cpp | 2 +- omp/factorization/elimination_forest_kernels.cpp | 2 +- reference/factorization/cholesky_kernels.cpp | 2 +- reference/factorization/elimination_forest_kernels.cpp | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index cf8d80c64e9..18ae23ffc38 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git 
a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp index 483144d8708..ce4dc823b98 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 6291b7628bc..d1a5a2dc8d4 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index f80e057da3a..fa87edd7fe3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp index 8d76ee73b54..83484ccbffc 100644 --- a/core/factorization/elimination_forest_kernels.hpp +++ b/core/factorization/elimination_forest_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 84478e214e6..f3a6624a762 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 13e5dfe0ea5..6b1014cdba0 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 50bdbceaa75..76a2f9d2b6a 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp index 4aecc0993f9..95bd75e49e0 100644 --- a/dpcpp/factorization/elimination_forest_kernels.dp.cpp +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index 76b1154512b..826e06026da 100644 --- 
a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index e150877e3a2..e65f7852208 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e489a3a2c57..dea639f376f 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index bd26900b78e..f0d46de5ca6 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause From 7bb00ab48ea5494ffe669bf5604887d9f964ce9c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 31 Dec 2024 18:43:46 +0100 Subject: [PATCH 13/26] rename compute_elim_forest to compute_elimination_forest --- .../reference/sparse_blas.reordered.stdout | 2 +- core/factorization/elimination_forest.cpp | 47 ++++++++++--------- core/factorization/elimination_forest.hpp | 4 +- core/factorization/symbolic.cpp | 7 +-- .../test/factorization/elimination_forest.cpp | 12 ++--- .../test/factorization/cholesky_kernels.cpp | 23 ++++----- test/factorization/cholesky_kernels.cpp | 23 ++++----- 7 files changed, 61 insertions(+), 57 deletions(-) diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout index b5fc8998be0..f40bf8814dd 100644 --- a/benchmark/test/reference/sparse_blas.reordered.stdout +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -9,7 +9,7 @@ "bandwidth": 1.0, "repetitions": 10, "components": { - "compute_elim_forest": 1.0, + "compute_elimination_forest": 1.0, "allocate": 1.0, "free": 1.0, "components::fill_array": 1.0, diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 1dc8ff060a0..59d7f3638d2 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -13,10 +13,9 @@ namespace { template -void compute_elim_forest_parent_impl(std::shared_ptr host_exec, - const IndexType* row_ptrs, - const IndexType* cols, IndexType num_rows, - IndexType* parent) +void compute_elimination_forest_parent_impl( + std::shared_ptr host_exec, const IndexType* row_ptrs, + const IndexType* cols, IndexType num_rows, IndexType* parent) { disjoint_sets subtrees{host_exec, num_rows}; array 
subtree_root_array{host_exec, @@ -50,8 +49,10 @@ void compute_elim_forest_parent_impl(std::shared_ptr host_exec, template -void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, - IndexType* child_ptr, IndexType* child) +void compute_elimination_forest_children_impl(const IndexType* parent, + IndexType size, + IndexType* child_ptr, + IndexType* child) { // count how many times each parent occurs, excluding pseudo-root at // parent == size @@ -74,7 +75,7 @@ void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, template -void compute_elim_forest_postorder_impl( +void compute_elimination_forest_postorder_impl( std::shared_ptr host_exec, const IndexType* parent, const IndexType* child_ptr, const IndexType* child, IndexType size, IndexType* postorder, IndexType* inv_postorder) @@ -111,10 +112,9 @@ void compute_elim_forest_postorder_impl( template -void compute_elim_forest_postorder_parent_impl(const IndexType* parent, - const IndexType* inv_postorder, - IndexType size, - IndexType* postorder_parent) +void compute_elimination_forest_postorder_parent_impl( + const IndexType* parent, const IndexType* inv_postorder, IndexType size, + IndexType* postorder_parent) { for (IndexType row = 0; row < size; row++) { postorder_parent[inv_postorder[row]] = @@ -140,26 +140,27 @@ void elimination_forest::set_executor( template -void compute_elim_forest(const matrix::Csr* mtx, - std::unique_ptr>& forest) +void compute_elimination_forest( + const matrix::Csr* mtx, + std::unique_ptr>& forest) { const auto host_exec = mtx->get_executor()->get_master(); const auto host_mtx = make_temporary_clone(host_exec, mtx); const auto num_rows = static_cast(host_mtx->get_size()[0]); forest = std::make_unique>(host_exec, num_rows); - compute_elim_forest_parent_impl(host_exec, host_mtx->get_const_row_ptrs(), - host_mtx->get_const_col_idxs(), num_rows, - forest->parents.get_data()); - compute_elim_forest_children_impl(forest->parents.get_const_data(), - num_rows, forest->child_ptrs.get_data(), - forest->children.get_data()); - compute_elim_forest_postorder_impl( + compute_elimination_forest_parent_impl( + host_exec, host_mtx->get_const_row_ptrs(), + host_mtx->get_const_col_idxs(), num_rows, forest->parents.get_data()); + compute_elimination_forest_children_impl( + forest->parents.get_const_data(), num_rows, + forest->child_ptrs.get_data(), forest->children.get_data()); + compute_elimination_forest_postorder_impl( host_exec, forest->parents.get_const_data(), forest->child_ptrs.get_const_data(), forest->children.get_const_data(), num_rows, forest->postorder.get_data(), forest->inv_postorder.get_data()); - compute_elim_forest_postorder_parent_impl( + compute_elimination_forest_postorder_parent_impl( forest->parents.get_const_data(), forest->inv_postorder.get_const_data(), num_rows, forest->postorder_parents.get_data()); @@ -169,7 +170,7 @@ void compute_elim_forest(const matrix::Csr* mtx, #define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elim_forest( \ + void compute_elimination_forest( \ const matrix::Csr* mtx, \ std::unique_ptr>& forest) diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp index 5307a90384c..50864ec6597 100644 --- a/core/factorization/elimination_forest.hpp +++ b/core/factorization/elimination_forest.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -41,7 
+41,7 @@ struct elimination_forest { template -void compute_elim_forest( +void compute_elimination_forest( const matrix::Csr* mtx, std::unique_ptr>& forest); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 9c7e1785396..4d4033f519c 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -35,7 +35,8 @@ GKO_REGISTER_OPERATION(symbolic_factorize_simple, lu_factorization::symbolic_factorize_simple); GKO_REGISTER_OPERATION(symbolic_factorize_simple_finalize, lu_factorization::symbolic_factorize_simple_finalize); -GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest); +GKO_REGISTER_HOST_OPERATION(compute_elimination_forest, + compute_elimination_forest); } // namespace @@ -51,7 +52,7 @@ void symbolic_cholesky( GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); const auto host_exec = exec->get_master(); - exec->run(make_compute_elim_forest(mtx, forest)); + exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; array tmp{exec}; @@ -100,7 +101,7 @@ void symbolic_cholesky_device( exec->run(make_compute_skeleton_tree( mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, skeleton->get_row_ptrs(), skeleton->get_col_idxs())); - exec->run(make_compute_elim_forest(skeleton.get(), forest)); + exec->run(make_compute_elimination_forest(skeleton.get(), forest)); } array row_ptrs{exec, num_rows + 1}; array tmp{exec}; diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 292b366f50e..9bbc9536c93 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -55,7 +55,7 @@ TYPED_TEST(EliminationForest, WorksForExample) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 8, 6, 7, 8, 9, 10})); @@ -92,7 +92,7 @@ TYPED_TEST(EliminationForest, WorksForSeparable) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 2, 10, 4, 5, 9, 7, 9, 9, 10})); @@ -128,7 +128,7 @@ TYPED_TEST(EliminationForest, WorksForPostOrderNotSelfInverse) }, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 5, 6, 7, 8, 9, 10})); GKO_ASSERT_ARRAY_EQ(forest->child_ptrs, @@ -152,7 +152,7 @@ TYPED_TEST(EliminationForest, WorksForAni1) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); // the elimination tree is a path gko::array iota_arr{this->ref, 36}; @@ -178,7 +178,7 @@ TYPED_TEST(EliminationForest, WorksForAni1Amd) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ( 
forest->parents, diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 9b47fcd6ad1..17bb0a36426 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -92,7 +92,8 @@ class Cholesky : public ::testing::Test { l_factor_ref->get_num_stored_elements()); combined = matrix_type::create(ref, combined_ref->get_size(), combined_ref->get_num_stored_elements()); - gko::factorization::compute_elim_forest(l_factor_ref.get(), forest); + gko::factorization::compute_elimination_forest(l_factor_ref.get(), + forest); // init sparsity lookup ref->copy(num_rows + 1, l_factor_ref->get_const_row_ptrs(), l_factor->get_row_ptrs()); @@ -246,18 +247,18 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) auto skeleton = matrix_type::create( this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); std::unique_ptr skeleton_forest; - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); - gko::factorization::compute_elim_forest(skeleton.get(), - skeleton_forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); this->assert_equal_forests(*skeleton_forest, *this->forest); }, @@ -273,8 +274,8 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::array row_nnz{this->ref, this->num_rows}; gko::kernels::reference::cholesky::symbolic_count( @@ -295,8 +296,8 @@ TYPED_TEST(Cholesky, KernelSymbolicFactorize) using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::cholesky::symbolic_count( this->ref, this->mtx.get(), *this->forest, this->l_factor->get_row_ptrs(), this->tmp); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index f4c4424c4e4..ed422978086 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -190,11 +190,11 @@ TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) std::unique_ptr forest; std::unique_ptr skeleton_forest; std::unique_ptr dskeleton_forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(skeleton.get(), - skeleton_forest); - gko::factorization::compute_elim_forest(dskeleton.get(), - dskeleton_forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elimination_forest(dskeleton.get(), + dskeleton_forest); // the parents array fully determines the elimination forest GKO_ASSERT_ARRAY_EQ(forest->parents, 
skeleton_forest->parents); GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, @@ -214,8 +214,8 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) const auto dmtx = gko::clone(this->exec, mtx); std::unique_ptr forest; std::unique_ptr dforest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array row_nnz{this->ref, mtx->get_size()[0]}; gko::array drow_nnz{this->exec, mtx->get_size()[0]}; @@ -241,7 +241,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) const auto dmtx = gko::clone(this->exec, mtx); const auto num_rows = mtx->get_size()[0]; std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); gko::array row_ptrs{this->ref, num_rows + 1}; gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_ptrs.get_data(), this->tmp); @@ -258,7 +258,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) gko::array{this->exec, nnz}, row_ptrs); // need to call the device kernels to initialize dtmp std::unique_ptr dforest; - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); @@ -358,8 +358,9 @@ class Cholesky : public CommonTestFixture { mtx_chol_sparsity->copy_from(mtx_chol.get()); dmtx_chol_sparsity = sparsity_pattern_type::create(exec); dmtx_chol_sparsity->copy_from(mtx_chol_sparsity.get()); - gko::factorization::compute_elim_forest(mtx_chol.get(), forest); - gko::factorization::compute_elim_forest(dmtx_chol.get(), dforest); + gko::factorization::compute_elimination_forest(mtx_chol.get(), forest); + gko::factorization::compute_elimination_forest(dmtx_chol.get(), + dforest); } void forall_matrices(std::function fn) From 6bff427e0ac1c9469311a0c5f47e3593f95b469e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:24:07 +0100 Subject: [PATCH 14/26] consider only lower triangular entries for skeleton tree input Co-authored-by: Marcel Koch --- reference/factorization/elimination_forest_kernels.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index f0d46de5ca6..7fc0df8c4ec 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -39,9 +39,11 @@ void compute_skeleton_tree(std::shared_ptr exec, for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { const auto col = cols[nz]; + if (col >= row) { + continue; + } // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); + edges.emplace_back(row, col); } } // sort edge list ascending by edge weight From 5aa16c8c47b527c17e69395ed346b13390be5674 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:34:58 +0100 Subject: [PATCH 15/26] fix memory script --- dev_tools/scripts/generate_cuda_memory_ptx.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index ed6a6cb32cf..265263f130f 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -62,7 +62,7 @@ class operation: operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header -print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +print("""// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -289,7 +289,8 @@ class operation: for s in memory_spaces_without_shared: o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True) const_ptr_expr = s.ptr_expr.format( ptr=f"const_cast*>(ptr)") mut_ptr_expr = s.ptr_expr.format(ptr="ptr") From 941ceb7ec83976ff02bbf4be4d3ce99331cd3ff0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:36:18 +0100 Subject: [PATCH 16/26] remove unnecessary tabs/newlines from generation script --- dev_tools/scripts/generate_cuda_memory_ptx.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 265263f130f..f5c995a76a2 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -253,14 +253,14 @@ class operation: __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ {t.parent_name} result; - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #else - " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;" "}}" : "={t.val_constraint}"(result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -273,13 +273,13 @@ class operation: __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #else - " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result)) @@ -299,15 +299,15 @@ class operation: {{ {t.parent_name} 
real_result; {t.parent_name} imag_result; - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #else - "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;" "}}" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -320,14 +320,14 @@ class operation: {{ auto real_result = static_cast<{t.parent_name}>(result.real()); auto imag_result = static_cast<{t.parent_name}>(result.imag()); - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #else - "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) From 418714b5ec2f56bcc7456fc3bb768bf377b30fc3 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:36:43 +0100 Subject: [PATCH 17/26] rebuild memory.nvidia.hpp.inc --- .../cuda_hip/components/memory.nvidia.hpp.inc | 128 +++++++++--------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index 18ae23ffc38..b9cef35ac77 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -2423,14 +2423,14 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.cta.b16 t, [%1];\n\t" + " ld.relaxed.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2443,13 +2443,13 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) __device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.cta.b16 [%0], t;\n\t" + " st.relaxed.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), 
"f"(static_cast(result)) @@ -2461,14 +2461,14 @@ __device__ __forceinline__ __half load_acquire_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.cta.b16 t, [%1];\n\t" + " ld.acquire.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2482,13 +2482,13 @@ __device__ __forceinline__ void store_release_local(__half* ptr, __half result) { membar_acq_rel_local(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.cta.b16 [%0], t;\n\t" + " st.release.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2500,14 +2500,14 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.gpu.b16 t, [%1];\n\t" + " ld.relaxed.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2520,13 +2520,13 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) __device__ __forceinline__ void store_relaxed(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.gpu.b16 [%0], t;\n\t" + " st.relaxed.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2538,14 +2538,14 @@ __device__ __forceinline__ __half load_acquire(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.gpu.b16 t, [%1];\n\t" + " ld.acquire.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -2559,13 +2559,13 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result) { membar_acq_rel(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.gpu.b16 [%0], t;\n\t" + " st.release.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -2579,15 +2579,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed_local( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -2602,14 +2602,14 @@ __device__ __forceinline__ void store_relaxed_local( auto real_result = static_cast(result.real()); auto 
imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.cta.v2.b16 [%0], t;\n\t" + "st.relaxed.cta.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) @@ -2623,15 +2623,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -2646,14 +2646,14 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.gpu.v2.b16 [%0], t;\n\t" + "st.relaxed.gpu.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) From bfd2598cc81da2884434d382ca337bb6e4c34e80 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:53:38 +0100 Subject: [PATCH 18/26] review updates Co-authored-by: Marcel Koch Co-authored-by: Yu-Hsiang M. 
Tsai --- benchmark/sparse_blas/operations.cpp | 4 +-- .../elimination_forest_kernels.cpp | 6 +--- core/device_hooks/common_kernels.inc.cpp | 2 -- core/factorization/elimination_forest.cpp | 9 +++--- .../elimination_forest_kernels.hpp | 29 ++++--------------- .../elimination_forest_kernels.cpp | 11 ++++--- reference/factorization/cholesky_kernels.cpp | 3 -- .../elimination_forest_kernels.cpp | 6 ++-- .../test/factorization/cholesky_kernels.cpp | 7 ++--- test/factorization/cholesky_kernels.cpp | 2 -- 10 files changed, 25 insertions(+), 54 deletions(-) diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 1b4e5819793..50c86c19bcf 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -616,7 +616,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { public: explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, device_{device}, result_{} + : mtx_{mtx}, device_{device}, symmetric_{symmetric}, result_{} {} std::pair validate() const override @@ -660,8 +660,8 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; - bool symmetric_; bool device_; + bool symmetric_; std::unique_ptr result_; std::unique_ptr> forest_; }; diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index d1a5a2dc8d4..41770e29a65 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -18,17 +18,13 @@ #include #include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/sparselib_bindings.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest_kernels.hpp" @@ -232,7 +228,7 @@ void compute_skeleton_tree(std::shared_ptr exec, const auto worklist_size = ceildiv(nnz, config::warp_size * 2) * config::warp_size; // create 2 worklists consisting of (start, end, edge_id) - array worklist{exec, static_cast(worklist_size * 8)}; + array worklist{exec, static_cast(worklist_size * 6)}; auto wl1_source = worklist.get_data(); auto wl1_target = wl1_source + worklist_size; auto wl1_edge_id = wl1_target + worklist_size; diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 19c9917ed4b..437e1145c40 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -941,8 +941,6 @@ namespace elimination_forest { GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); -GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES); -GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); } // namespace elimination_forest diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 59d7f3638d2..89bd9505f6d 100644 --- 
a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -169,12 +169,13 @@ void compute_elimination_forest( } -#define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elimination_forest( \ - const matrix::Csr* mtx, \ +#define GKO_DECLARE_COMPUTE_ELIMINATION_FOREST(ValueType, IndexType) \ + void compute_elimination_forest( \ + const matrix::Csr* mtx, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COMPUTE_ELIMINATION_FOREST); } // namespace factorization diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp index 83484ccbffc..39435846119 100644 --- a/core/factorization/elimination_forest_kernels.hpp +++ b/core/factorization/elimination_forest_kernels.hpp @@ -35,30 +35,11 @@ namespace kernels { gko::factorization::elimination_forest& forest) -#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType) \ - void compute_subtree_sizes( \ - std::shared_ptr exec, \ - const gko::factorization::elimination_forest& forest, \ - IndexType* subtree_size) - - -#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) \ - void compute_postorder( \ - std::shared_ptr exec, \ - const gko::factorization::elimination_forest& forest, \ - const IndexType* subtree_size, IndexType* postorder, \ - IndexType* inv_postorder) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SUBTREE_SIZES(IndexType); \ - template \ - GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_POSTORDER(IndexType) +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index e65f7852208..57bd9a04530 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -31,21 +31,24 @@ void compute_skeleton_tree(std::shared_ptr exec, { disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); - vector> edges(nnz, exec); + vector> edges(exec); + edges.reserve(nnz / 2); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { const auto col = cols[nz]; + if (col >= row) { + continue; + } // edge contains (max, min) pair - auto edge = std::minmax(row, col, std::greater{}); - edges.push_back(edge); + edges.emplace_back(row, col); } } // sort edge list ascending by edge weight std::sort(edges.begin(), edges.end()); // output helper array: Store row indices for output rows // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); + vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with // ascending weight diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index dea639f376f..80b6c53663b 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ 
b/reference/factorization/cholesky_kernels.cpp @@ -6,16 +6,13 @@ #include #include -#include #include #include "core/base/allocator.hpp" -#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index 7fc0df8c4ec..a86aac7f281 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -13,9 +13,7 @@ #include "core/base/allocator.hpp" #include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" -#include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest_kernels.hpp" @@ -34,7 +32,7 @@ void compute_skeleton_tree(std::shared_ptr exec, disjoint_sets sets(exec, size); const auto nnz = static_cast(row_ptrs[size]); vector> edges(exec); - edges.reserve(nnz); + edges.reserve(nnz / 2); // collect edge list for (auto row : irange(static_cast(size))) { for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { @@ -50,7 +48,7 @@ void compute_skeleton_tree(std::shared_ptr exec, std::sort(edges.begin(), edges.end()); // output helper array: Store row indices for output rows // since we sorted by edge.first == row, this will be sorted - std::vector out_rows(size); + vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with // ascending weight diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 17bb0a36426..1227c7e5e3d 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -142,6 +142,9 @@ class Cholesky : public ::testing::Test { fn(); } { + // structurally this is the example from Liu 1990 + // "The Role of Elimination Trees in Sparse Factorization" + // https://doi.org/10.1137/0611010. 
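For illustration, the Kruskal-style construction used by the OpenMP and reference compute_skeleton_tree kernels above boils down to the following host-side sketch. It assumes a plain CSR pattern held in std::vector containers; find_root and skeleton_tree are illustrative names rather than Ginkgo API, and the flat parent array stands in for gko::disjoint_sets. Because the edges are emitted row by row and the row index doubles as the edge weight, they are already sorted and Kruskal's algorithm can consume them directly, exactly as the comments in the kernels point out.

#include <cstdint>
#include <utility>
#include <vector>

using index_type = std::int64_t;

// Illustrative union-find lookup with path halving; this stands in for
// gko::disjoint_sets, which the kernels above use instead.
index_type find_root(std::vector<index_type>& parent, index_type node)
{
    while (parent[node] != node) {
        parent[node] = parent[parent[node]];  // path halving
        node = parent[node];
    }
    return node;
}

// Kruskal-style skeleton tree: keep one strictly lower-triangular edge per
// pair of components that it connects for the first time. The edges appear
// in row order and the row index doubles as the edge weight, so no extra
// sorting is required.
std::vector<std::pair<index_type, index_type>> skeleton_tree(
    const std::vector<index_type>& row_ptrs,
    const std::vector<index_type>& cols)
{
    const auto num_rows = static_cast<index_type>(row_ptrs.size()) - 1;
    std::vector<index_type> parent(num_rows);
    for (index_type i = 0; i < num_rows; i++) {
        parent[i] = i;
    }
    std::vector<std::pair<index_type, index_type>> tree_edges;
    for (index_type row = 0; row < num_rows; row++) {
        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const auto col = cols[nz];
            if (col >= row) {
                continue;  // only consider the strictly lower triangle
            }
            const auto row_rep = find_root(parent, row);
            const auto col_rep = find_root(parent, col);
            if (row_rep != col_rep) {
                parent[row_rep] = col_rep;  // join the two components
                tree_edges.emplace_back(row, col);
            }
        }
    }
    return tree_edges;
}

Each accepted edge merges two previously disconnected components, so at most num_rows - 1 edges survive. The resulting skeleton matrix is much sparser than the input but has the same elimination forest, which is what the KernelComputeSkeletonTree tests in this series verify.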
SCOPED_TRACE("example"); this->setup( {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0}, @@ -238,7 +241,6 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; @@ -249,7 +251,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) std::unique_ptr skeleton_forest; gko::factorization::compute_elimination_forest(this->mtx.get(), this->forest); - gko::kernels::reference::elimination_forest::compute_skeleton_tree( this->ref, this->mtx->get_const_row_ptrs(), this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], @@ -269,7 +270,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( @@ -291,7 +291,6 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) TYPED_TEST(Cholesky, KernelSymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ed422978086..e2696f11d89 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -50,7 +50,6 @@ class CholeskySymbolic : public CommonTestFixture { {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); // this is the example from Liu 1990 https://doi.org/10.1137/0611010. 
// "The Role of Elimination Trees in Sparse Factorization" - matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, @@ -159,7 +158,6 @@ TYPED_TEST_SUITE(CholeskySymbolic, Types, PairTypenameNameGenerator); TYPED_TEST(CholeskySymbolic, KernelComputeSkeletonTree) { using matrix_type = typename TestFixture::matrix_type; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; using elimination_forest = typename TestFixture::elimination_forest; for (const auto& pair : this->matrices) { From 8656cdf95df90ce1677e8043c0807ef43c564a6b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 10:57:41 +0100 Subject: [PATCH 19/26] remove unnecessary host_exec --- core/factorization/symbolic.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index 4d4033f519c..92d0bf663cd 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -51,7 +51,6 @@ void symbolic_cholesky( using matrix_type = matrix::Csr; GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); - const auto host_exec = exec->get_master(); exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; @@ -94,7 +93,6 @@ void symbolic_cholesky_device( GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); const auto num_rows = mtx->get_size()[0]; - const auto host_exec = exec->get_master(); { const auto skeleton = matrix_type::create(exec, mtx->get_size(), num_rows); From aca0fdc44cf6748de2c6aadae6132dd47bcb3171 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:05:30 +0100 Subject: [PATCH 20/26] clarify sortedness in MST algorithm --- omp/factorization/elimination_forest_kernels.cpp | 8 +++++--- reference/factorization/elimination_forest_kernels.cpp | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index 57bd9a04530..f3f180a515d 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -44,10 +44,12 @@ void compute_skeleton_tree(std::shared_ptr exec, edges.emplace_back(row, col); } } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted + // since the input is sorted by edge.first == row, this will be sorted vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp index a86aac7f281..dbf00d02d72 100644 --- a/reference/factorization/elimination_forest_kernels.cpp +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -44,10 +44,12 @@ void compute_skeleton_tree(std::shared_ptr exec, edges.emplace_back(row, col); } } - // sort edge list ascending by edge weight - std::sort(edges.begin(), edges.end()); + // the edge list is now sorted by 
row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); // output helper array: Store row indices for output rows - // since we sorted by edge.first == row, this will be sorted + // since the input is sorted by edge.first == row, this will be sorted vector out_rows(size, exec); IndexType output_count{}; // Kruskal algorithm: Connect unconnected components using edges with From dbbdd5ef70a71443fc9506ec45b69f1404f87dbc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:06:47 +0100 Subject: [PATCH 21/26] remove unnecessary type alias --- reference/test/factorization/cholesky_kernels.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 1227c7e5e3d..0638f8ca931 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -242,7 +242,6 @@ TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) { using matrix_type = typename TestFixture::matrix_type; using elimination_forest = typename TestFixture::elimination_forest; - using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { From 5c7c56541788b4ee75bb3c5752991f402aa6908a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:14:08 +0100 Subject: [PATCH 22/26] remove dependency on core for disjoint_sets --- core/components/disjoint_sets.hpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/core/components/disjoint_sets.hpp b/core/components/disjoint_sets.hpp index 601817e69d3..77a9177d172 100644 --- a/core/components/disjoint_sets.hpp +++ b/core/components/disjoint_sets.hpp @@ -1,12 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #ifndef GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ #define GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ - -#include +#include "core/base/allocator.hpp" namespace gko { @@ -31,10 +30,8 @@ class disjoint_sets { * allocate storage. */ explicit disjoint_sets(std::shared_ptr exec, IndexType size) - : parents_{exec->get_master(), static_cast(size)} - { - parents_.fill(-1); - } + : parents_{static_cast(size), -1, exec->get_master()} + {} /** * Returns true if and only if the element x is the representative of its @@ -138,17 +135,14 @@ class disjoint_sets { * * @return the number of elements represented in this data structure. 
*/ - IndexType get_size() const { return parents_.get_size(); } + IndexType get_size() const { return parents_.size(); } private: - const IndexType& parent(IndexType x) const - { - return parents_.get_const_data()[x]; - } + const IndexType& parent(IndexType x) const { return parents_[x]; } - IndexType& parent(IndexType x) { return parents_.get_data()[x]; } + IndexType& parent(IndexType x) { return parents_[x]; } - array parents_; + vector parents_; }; From fb344a470462ea3e645b927e90becec4271b98de Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 14:35:26 +0100 Subject: [PATCH 23/26] add literature reference --- .../cuda_hip/factorization/elimination_forest_kernels.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 41770e29a65..069d448ea37 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -215,6 +215,13 @@ void compute_skeleton_tree(std::shared_ptr exec, size_type size, IndexType* out_row_ptrs, IndexType* out_cols) { + // This is a minimum spanning tree algorithm implementation based on + // A. Fallin, A. Gonzalez, J. Seo, and M. Burtscher, + // "A High-Performance MST Implementation for GPUs,” + // doi: 10.1145/3581784.3607093 + // we don't filter heavy edges since the heaviest edges are necessary to + // reach the last node and we don't need to sort since the COO format + // already sorts by row index. const auto policy = thrust_policy(exec); const auto nnz = exec->copy_val_to_host(row_ptrs + size); // convert edges to COO representation From 706d58472225126d4079a841d6f1a8278ba92984 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 15 Jan 2025 15:40:58 +0100 Subject: [PATCH 24/26] fix dpcpp compilation --- dpcpp/factorization/elimination_forest_kernels.dp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp index 95bd75e49e0..01310b5c448 100644 --- a/dpcpp/factorization/elimination_forest_kernels.dp.cpp +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -17,7 +17,7 @@ namespace gko { namespace kernels { namespace dpcpp { -namespace climination_forest { +namespace elimination_forest { template @@ -40,7 +40,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); -} // namespace climination_forest +} // namespace elimination_forest } // namespace dpcpp } // namespace kernels } // namespace gko From 0debdc6c6e230ba67a805dd5905aadb2f422e005 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 16 Jan 2025 11:28:58 +0100 Subject: [PATCH 25/26] use atomic wrappers --- .../elimination_forest_kernels.cpp | 19 ++++---------- hip/components/memory.hip.hpp | 26 ++++++++++++++++++- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 069d448ea37..553032dcf2c 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -48,8 +48,6 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { - using atomic_type = std::conditional_t, - 
int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -57,8 +55,7 @@ __global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( const auto row = rows[i]; const auto col = cols[i]; if (col < row) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(worklist_counter, 1); worklist_sources[out_i] = row; worklist_targets[out_i] = col; worklist_edge_ids[out_i] = i; @@ -93,12 +90,9 @@ __device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) template __device__ void guarded_atomic_min(IndexType* ptr, IndexType value) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; // only execute the atomic if we know that it might have an effect if (load_relaxed_local(ptr) > value) { - atomicMin(reinterpret_cast(ptr), - static_cast(value)); + atomic_min_relaxed(ptr, value); } } @@ -114,8 +108,6 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( IndexType* __restrict__ worklist_edge_ids, IndexType* __restrict__ worklist_counter) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -126,8 +118,7 @@ __global__ __launch_bounds__(default_block_size) void mst_find_minimum( const auto source_rep = mst_find(parents, source); const auto target_rep = mst_find(parents, target); if (source_rep != target_rep) { - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(worklist_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(worklist_counter, 1); worklist_sources[out_i] = source_rep; worklist_targets[out_i] = target_rep; worklist_edge_ids[out_i] = edge_id; @@ -178,8 +169,7 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( repeat = true; } } while (repeat); - const auto out_i = static_cast(atomicAdd( - reinterpret_cast(out_counter), atomic_type{1})); + const auto out_i = atomic_add_relaxed(out_counter, 1); out_sources[out_i] = edge_sources[edge_id]; out_targets[out_i] = edge_targets[edge_id]; } @@ -309,6 +299,7 @@ void compute_skeleton_tree(std::shared_ptr exec, } } const auto num_mst_edges = exec->copy_val_to_host(output_counter); + // two separate sort calls get turned into efficient RadixSort invocations thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, out_cols); diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index 8a98ee822b8..d279a8ac9de 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -128,6 +128,30 @@ __device__ __forceinline__ void store_generic(ValueType* ptr, ValueType value) } +template +__device__ __forceinline__ ValueType atomic_add_relaxed(ValueType* ptr, + AddType value) +{ + return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_min_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_min(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_max(ptr, value, __ATOMIC_RELAXED); +} + + template __device__ __forceinline__ ValueType 
load_relaxed(const ValueType* ptr) { From 7ff5f5917198f6d70df21182abebe2c1224bd3ba Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 16 Jan 2025 12:57:44 +0100 Subject: [PATCH 26/26] improve atomic usage to avoid casts --- .../cuda_hip/components/memory.nvidia.hpp.inc | 238 ++++++++++++++++++ .../elimination_forest_kernels.cpp | 6 +- .../cuda_hip/matrix/csr_kernels.template.cpp | 6 +- common/cuda_hip/reorder/rcm_kernels.cpp | 36 +-- dev_tools/scripts/generate_cuda_memory_ptx.py | 22 +- hip/components/memory.hip.hpp | 21 ++ 6 files changed, 291 insertions(+), 38 deletions(-) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index b9cef35ac77..f77481cd793 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -153,6 +153,26 @@ __device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, } +__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -241,6 +261,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, } +__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -467,6 +507,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, } +__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) { uint64 result; @@ -593,6 +653,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, } +__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) { int32 result; @@ -873,6 +953,26 @@ __device__ 
__forceinline__ int32 atomic_max_relaxed_local(int32* ptr, } +__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) { int64 result; @@ -961,6 +1061,26 @@ __device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, } +__device__ __forceinline__ int64 atomic_cas_relaxed_local(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float load_relaxed_local(const float* ptr) { float result; @@ -1187,6 +1307,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, } +__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) { uint64 result; @@ -1313,6 +1453,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, } +__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire_local(const int32* ptr) { int32 result; @@ -1590,6 +1750,25 @@ __device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) } +__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed(const int64* ptr) { int64 result; @@ -1675,6 +1854,25 @@ __device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) } +__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ float 
load_relaxed(const float* ptr) { float result; @@ -1894,6 +2092,26 @@ __device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) } +__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { uint64 result; @@ -2015,6 +2233,26 @@ __device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) } +__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int32 load_acquire(const int32* ptr) { int32 result; diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp index 553032dcf2c..52002e9a211 100644 --- a/common/cuda_hip/factorization/elimination_forest_kernels.cpp +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -139,8 +139,6 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, IndexType* __restrict__ out_counter) { - using atomic_type = std::conditional_t, - int32, unsigned long long>; const auto i = thread::get_thread_id_flat(); if (i >= size) { return; @@ -159,9 +157,7 @@ __global__ __launch_bounds__(default_block_size) void mst_join_edges( do { repeat = false; auto old_parent = - atomicCAS(reinterpret_cast(parents + old_rep), - static_cast(old_rep), - static_cast(new_rep)); + atomic_cas_relaxed(parents + old_rep, old_rep, new_rep); // if this fails, the parent of old_rep changed recently, so we need // to try again by updating the parent's parent (hopefully its rep) if (old_parent != old_rep) { diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index bd2423d4306..8b3167eca97 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -35,6 +35,7 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/reduction.hpp" @@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( } #else if (i < row_len) { - while (atomicCAS(local_storage + hash, empty, i) != empty) { + while (atomic_cas_relaxed_local(local_storage + hash, empty, + static_cast(i)) != empty) { hash++; if (hash >= available_storage) { hash = 0; diff --git 
a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 2bb18cbdd22..96370647721 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -153,22 +153,6 @@ __device__ __forceinline__ IndexType disjoint_set_find(IndexType node, } -template -struct atomic_map {}; - - -template <> -struct atomic_map { - using type = int; -}; - - -template <> -struct atomic_map { - using type = unsigned long long; -}; - - template __global__ __launch_bounds__(default_block_size) void connected_components_combine( @@ -176,7 +160,6 @@ __launch_bounds__(default_block_size) void connected_components_combine( const IndexType* __restrict__ col_idxs, IndexType num_rows, IndexType* __restrict__ parents) { - using atomic_type = typename atomic_map::type; const auto row = thread::get_thread_id_flat(); if (row >= num_rows) { return; @@ -196,10 +179,8 @@ __launch_bounds__(default_block_size) void connected_components_combine( auto& max_parent = col_parent < parent ? parent : col_parent; // attempt to attach the (assumed unattached) larger node to the // smaller node - const auto old_parent = static_cast(atomicCAS( - reinterpret_cast(parents + max_parent), - static_cast(max_parent), - static_cast(min_parent))); + const auto old_parent = atomic_cas_relaxed( + parents + max_parent, max_parent, min_parent); // if unsuccessful, proceed with the parent of the (now known // attached) node if (old_parent != max_parent) { @@ -354,7 +335,6 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( IndexType level, IndexType* __restrict__ node_levels, IndexType* __restrict__ level_nodes, IndexType* __restrict__ output_ptr) { - using atomic_type = typename atomic_map::type; const auto source = thread::get_thread_id_flat(); if (source >= num_sources) { return; @@ -362,17 +342,13 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( const auto row = sources[source]; const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; - atomic_type unsigned_unattached{}; const auto unattached = invalid_index(); - memcpy(&unsigned_unattached, &unattached, sizeof(IndexType)); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; if (node_levels[col] == unattached && - atomicCAS(reinterpret_cast(node_levels + col), - unsigned_unattached, - static_cast(level)) == unsigned_unattached) { - const auto output_pos = - atomicAdd(reinterpret_cast(output_ptr), 1); + atomic_cas_relaxed(node_levels + col, unattached, level) == + unattached) { + const auto output_pos = atomic_add_relaxed(output_ptr, 1); level_nodes[output_pos] = col; } } diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index f5c995a76a2..089cbf5c2dd 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -169,7 +169,6 @@ class operation: }} """) for op in operations: - if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): # for some reason there is no signed 64 bit atomic support, # but since the operations are equivalent in two's complement, we can use unsigned instead @@ -193,6 +192,27 @@ class operation: : "memory"); return result; }} +""") + # non-relaxed atomics are unsupported 
with SM 6.0 and below + if o.is_relaxed and not t.ptx_type_suffix.startswith(".f"): + new_type = ".b" + t.ptx_type_suffix[2:] + print(f""" +__device__ __forceinline__ {t.name} atomic_cas{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} old_val, {t.name} new_val) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}.cas{new_type} %0, [%1], %2, %3;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(old_val), "{t.val_constraint}"(new_val) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index d279a8ac9de..a05bf7de344 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -152,6 +152,27 @@ __device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, } +template +__device__ __forceinline__ ValueType atomic_cas_relaxed(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + __atomic_compare_exchange_n(ptr, &old_val, new_val, false, __ATOMIC_RELAXED, + __ATOMIC_RELAXED); + return old_val; +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed_local(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + // no special optimization available for threadblock-local atomic CAS + return atomic_cas_relaxed(ptr, old_val, new_val); +} + + template __device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) {
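To see what the new atomic_cas_relaxed wrapper buys in practice, here is a rough host-side analogue of the retry loop that mst_join_edges and connected_components_combine run on the device. It uses std::atomic with relaxed ordering instead of the CUDA/HIP wrappers; find_representative and join_representatives are illustrative names, not Ginkgo functions, and the sketch is not the actual device code.

#include <atomic>
#include <cstdint>

using index_type = std::int64_t;

// Follow parent pointers until reaching a self-parented representative,
// similar to mst_find in the device kernels (without path compression).
index_type find_representative(const std::atomic<index_type>* parents,
                               index_type node)
{
    auto parent = parents[node].load(std::memory_order_relaxed);
    while (parent != node) {
        node = parent;
        parent = parents[node].load(std::memory_order_relaxed);
    }
    return node;
}

// Attach old_rep below new_rep with a relaxed compare-and-swap; on failure,
// continue one level further up, like the retry loop in mst_join_edges and
// connected_components_combine.
void join_representatives(std::atomic<index_type>* parents, index_type old_rep,
                          index_type new_rep)
{
    bool repeat = false;
    do {
        repeat = false;
        auto expected = old_rep;
        if (!parents[old_rep].compare_exchange_strong(
                expected, new_rep, std::memory_order_relaxed)) {
            // the parent of old_rep changed concurrently, so continue with
            // the value another thread installed there
            old_rep = expected;
            repeat = true;
        }
    } while (repeat);
}

A caller initializes parents[i] = i for every node; concurrent join_representatives calls then graft one representative under the other exactly once, because only the thread whose compare-exchange succeeds installs the new parent, while every other thread climbs to the freshly installed parent and retries from there.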