diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 61eeaeeb627..50c86c19bcf 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -614,8 +614,9 @@ class SymbolicLuNearSymmOperation : public BenchmarkOperation { class SymbolicCholeskyOperation : public BenchmarkOperation { public: - explicit SymbolicCholeskyOperation(const Mtx* mtx, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, result_{} + explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, + bool symmetric) + : mtx_{mtx}, device_{device}, symmetric_{symmetric}, result_{} {} std::pair validate() const override @@ -643,8 +644,13 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { void run() override { - gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, - forest_); + if (device_) { + gko::factorization::symbolic_cholesky_device(mtx_, symmetric_, + result_, forest_); + } else { + gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, + forest_); + } } void write_stats(json& object) override @@ -654,6 +660,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; + bool device_; bool symmetric_; std::unique_ptr result_; std::unique_ptr> forest_; @@ -791,13 +798,25 @@ const std::map(mtx); }}, + {"symbolic_cholesky_device", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + false); + }}, + {"symbolic_cholesky_device_symmetric", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + true); + }}, {"symbolic_cholesky", [](const Mtx* mtx) { - return std::make_unique(mtx, false); + return std::make_unique(mtx, false, + false); }}, {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { - return std::make_unique(mtx, true); + return std::make_unique(mtx, false, + true); }}, {"reorder_rcm", [](const Mtx* mtx) { diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout index b5fc8998be0..f40bf8814dd 100644 --- a/benchmark/test/reference/sparse_blas.reordered.stdout +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -9,7 +9,7 @@ "bandwidth": 1.0, "repetitions": 10, "components": { - "compute_elim_forest": 1.0, + "compute_elimination_forest": 1.0, "allocate": 1.0, "free": 1.0, "components::fill_array": 1.0, diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 5cfa55ca687..d7d1de30dea 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -12,6 +12,7 @@ set(CUDA_HIP_SOURCES distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp factorization/factorization_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp factorization/lu_kernels.cpp diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index f759c613f45..f77481cd793 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -96,6 +96,83 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) } +__device__ __forceinline__ int32 atomic_add_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.s32 %0, [%1], %2;" +#elif 
__CUDA_ARCH__ < 700 + "atom.shared.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -127,6 +204,83 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) } +__device__ __forceinline__ int64 atomic_add_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + 
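
// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the patch): a minimal usage
// sketch of the relaxed shared-memory atomics added above. It assumes the
// kernel is compiled inside Ginkgo's CUDA/HIP kernel namespace so that
// `int32` and `atomic_min_relaxed_shared` from this header are visible; the
// kernel name and launch shape are hypothetical.
__global__ void block_min_example(const int32* __restrict__ values, int32 size,
                                  int32* __restrict__ block_minima)
{
    __shared__ int32 block_min;
    if (threadIdx.x == 0) {
        block_min = 0x7fffffff;  // int32 max as the reduction identity
    }
    __syncthreads();
    const auto i = static_cast<int32>(blockIdx.x * blockDim.x + threadIdx.x);
    if (i < size) {
        // relaxed ordering is sufficient here: the barrier below orders the
        // shared-memory update before thread 0 reads it back
        atomic_min_relaxed_shared(&block_min, values[i]);
    }
    __syncthreads();
    if (threadIdx.x == 0) {
        block_minima[blockIdx.x] = block_min;
    }
}
// The acquire/release shared-memory variants defined further down keep their
// fences for producer-consumer handshakes that do not pass through a full
// block barrier.
// ---------------------------------------------------------------------------
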
__device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -158,6 +312,25 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) } +__device__ __forceinline__ float atomic_add_relaxed_shared(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "f"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; @@ -189,31 +362,49 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed_shared(double* ptr, + double value) { - int32 result; + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_shared(const uint32* ptr) +{ + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.shared.s32 %0, [%1];" + "ld.volatile.shared.u32 %0, [%1];" #else - "ld.acquire.cta.shared.s32 %0, [%1];" + "ld.relaxed.cta.shared.u32 %0, [%1];" #endif : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); - membar_acq_rel_shared(); + return result; } -__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed_shared(uint32* ptr, uint32 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.s32 [%0], %1;" + "st.volatile.shared.u32 [%0], %1;" #else - "st.release.cta.shared.s32 [%0], %1;" + "st.relaxed.cta.shared.u32 [%0], %1;" #endif ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) @@ -221,31 +412,145 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) } -__device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed_shared(uint32* ptr, + uint32 value) { - int64 result; + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u32 %0, [%1], %2;" +#elif 
__CUDA_ARCH__ < 700 + "atom.shared.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) +{ + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.shared.s64 %0, [%1];" + "ld.volatile.shared.u64 %0, [%1];" #else - "ld.acquire.cta.shared.s64 %0, [%1];" + "ld.relaxed.cta.shared.u64 %0, [%1];" #endif : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); - membar_acq_rel_shared(); + return result; } -__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +__device__ __forceinline__ void store_relaxed_shared(uint64* ptr, uint64 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.s64 [%0], %1;" + "st.volatile.shared.u64 [%0], %1;" #else - "st.release.cta.shared.s64 [%0], %1;" + "st.relaxed.cta.shared.u64 [%0], %1;" #endif ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) @@ -253,140 +558,1329 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) } -__device__ __forceinline__ float load_acquire_shared(const float* ptr) +__device__ __forceinline__ uint64 atomic_add_relaxed_shared(uint64* ptr, + uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.shared.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" #else - "ld.acquire.cta.shared.f32 %0, [%1];" + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); - membar_acq_rel_shared(); return result; } -__device__ __forceinline__ void store_release_shared(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_min_relaxed_shared(uint64* ptr, + uint64 value) { - 
membar_acq_rel_shared(); + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.shared.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u64 %0, [%1], %2;" #else - "st.release.cta.shared.f32 [%0], %1;" + "atom.relaxed.cta.shared.min.u64 %0, [%1], %2;" #endif - ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_shared(const double* ptr) +__device__ __forceinline__ uint64 atomic_max_relaxed_shared(uint64* ptr, + uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.shared.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u64 %0, [%1], %2;" #else - "ld.acquire.cta.shared.f64 %0, [%1];" + "atom.relaxed.cta.shared.max.u64 %0, [%1], %2;" #endif - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); - membar_acq_rel_shared(); return result; } -__device__ __forceinline__ void store_release_shared(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_and_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.s32 %0, [%1];" +#else + "ld.acquire.cta.shared.s32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.s32 [%0], %1;" +#else + "st.release.cta.shared.s32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.s64 %0, [%1];" +#else + 
"ld.acquire.cta.shared.s64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.s64 [%0], %1;" +#else + "st.release.cta.shared.s64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_shared(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.f32 %0, [%1];" +#else + "ld.acquire.cta.shared.f32 %0, [%1];" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(float* ptr, float result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.f32 [%0], %1;" +#else + "st.release.cta.shared.f32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_shared(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.f64 %0, [%1];" +#else + "ld.acquire.cta.shared.f64 %0, [%1];" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(double* ptr, double result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.f64 [%0], %1;" +#else + "st.release.cta.shared.f64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_shared(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.acquire.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint32* ptr, uint32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.release.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.acquire.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint64* ptr, uint64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.release.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : 
"memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.relaxed.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 atomic_add_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ 
int64 atomic_cas_relaxed_local(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ float load_relaxed_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.relaxed.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.relaxed.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ float atomic_add_relaxed_local(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.relaxed.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.relaxed.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ double atomic_add_relaxed_local(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.relaxed.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.relaxed.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; 
+} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.relaxed.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.relaxed.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u64 %0, 
[%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.acquire.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.release.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.acquire.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.release.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.acquire.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(float* ptr, float result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.release.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.acquire.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(double* ptr, double result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.release.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); 
+ membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint32* ptr, uint32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint64* ptr, uint64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.gpu.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.gpu.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.gpu.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.f64 [%0], %1;" + "st.volatile.s64 [%0], %1;" #else - "st.release.cta.shared.f64 [%0], %1;" + "st.relaxed.gpu.s64 [%0], 
%1;" #endif - ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) + ::"l"(ptr), + "l"(result) : "memory"); } -__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +__device__ __forceinline__ int64 atomic_add_relaxed(int64* ptr, int64 value) { - int32 result; + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.cta.s32 %0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +__device__ __forceinline__ int64 atomic_min_relaxed(int64* ptr, int64 value) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s64 %0, [%1], %2;" #else - "st.relaxed.cta.s32 [%0], %1;" + "atom.relaxed.gpu.min.s64 %0, [%1], %2;" #endif - ::"l"(ptr), - "r"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +__device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) { int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s64 %0, [%1], %2;" #else - "ld.relaxed.cta.s64 %0, [%1];" + "atom.relaxed.gpu.max.s64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val, + int64 new_val) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" #else - "st.relaxed.cta.s64 [%0], %1;" + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed_local(const float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f32 %0, [%1];" #else - "ld.relaxed.cta.f32 %0, [%1];" + "ld.relaxed.gpu.f32 %0, [%1];" #endif : "=f"(result) : "l"(const_cast(ptr)) @@ -396,13 +1890,13 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr) } -__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +__device__ __forceinline__ void store_relaxed(float* ptr, float result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f32 [%0], %1;" #else - "st.relaxed.cta.f32 [%0], %1;" + "st.relaxed.gpu.f32 [%0], %1;" #endif ::"l"(ptr), "f"(result) @@ -410,14 +1904,32 @@ __device__ __forceinline__ void store_relaxed_local(float* ptr, float result) } -__device__ __forceinline__ double load_relaxed_local(const double* ptr) +__device__ __forceinline__ float atomic_add_relaxed(float* ptr, float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f32 %0, [%1], %2;" +#else + 
"atom.relaxed.gpu.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f64 %0, [%1];" #else - "ld.relaxed.cta.f64 %0, [%1];" + "ld.relaxed.gpu.f64 %0, [%1];" #endif : "=d"(result) : "l"(const_cast(ptr)) @@ -427,13 +1939,13 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr) } -__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +__device__ __forceinline__ void store_relaxed(double* ptr, double result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f64 [%0], %1;" #else - "st.relaxed.cta.f64 [%0], %1;" + "st.relaxed.gpu.f64 [%0], %1;" #endif ::"l"(ptr), "d"(result) @@ -441,31 +1953,48 @@ __device__ __forceinline__ void store_relaxed_local(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed(double* ptr, double value) { - int32 result; + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed(const uint32* ptr) +{ + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.u32 %0, [%1];" #else - "ld.acquire.cta.s32 %0, [%1];" + "ld.relaxed.gpu.u32 %0, [%1];" #endif : "=r"(result) - : "l"(const_cast(ptr)) + : "l"(const_cast(ptr)) : "memory"); - membar_acq_rel_local(); + return result; } -__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(uint32* ptr, uint32 result) { - membar_acq_rel_local(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.u32 [%0], %1;" #else - "st.release.cta.s32 [%0], %1;" + "st.relaxed.gpu.u32 [%0], %1;" #endif ::"l"(ptr), "r"(result) @@ -473,223 +2002,254 @@ __device__ __forceinline__ void store_release_local(int32* ptr, int32 result) } -__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed(uint32* ptr, uint32 value) { - int64 result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u32 %0, [%1], %2;" #else - "ld.acquire.cta.s64 %0, [%1];" + "atom.relaxed.gpu.add.u32 %0, [%1], %2;" #endif - : "=l"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +__device__ __forceinline__ uint32 atomic_min_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u32 %0, [%1], %2;" #else - "st.release.cta.s64 [%0], %1;" + "atom.relaxed.gpu.min.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_acquire_local(const float* ptr) +__device__ __forceinline__ uint32 
atomic_max_relaxed(uint32* ptr, uint32 value) { - float result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f32 %0, [%1];" + "atom.relaxed.gpu.max.u32 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(float* ptr, float result) +__device__ __forceinline__ uint32 atomic_and_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u32 %0, [%1], %2;" #else - "st.release.cta.f32 [%0], %1;" + "atom.relaxed.gpu.and.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_local(const double* ptr) +__device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) { - double result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f64 %0, [%1];" + "atom.relaxed.gpu.or.u32 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(double* ptr, double result) +__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr, + uint32 old_val, + uint32 new_val) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" #else - "st.release.cta.f64 [%0], %1;" + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "d"(result) + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) : "memory"); + return result; } -__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { - int32 result; + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.u64 %0, [%1];" #else - "ld.relaxed.gpu.s32 %0, [%1];" + "ld.relaxed.gpu.u64 %0, [%1];" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); return result; } -__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(uint64* ptr, uint64 result) { asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.u64 [%0], %1;" #else - "st.relaxed.gpu.s32 [%0], %1;" + "st.relaxed.gpu.u64 [%0], %1;" #endif ::"l"(ptr), - "r"(result) + "l"(result) : "memory"); } -__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +__device__ __forceinline__ uint64 atomic_add_relaxed(uint64* ptr, uint64 value) { - int64 result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.s64 
%0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +__device__ __forceinline__ uint64 atomic_min_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.s64 [%0], %1;" + "atom.relaxed.gpu.min.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed(const float* ptr) +__device__ __forceinline__ uint64 atomic_max_relaxed(uint64* ptr, uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f32 %0, [%1];" + "atom.relaxed.gpu.max.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_and_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f32 [%0], %1;" + "atom.relaxed.gpu.and.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_relaxed(const double* ptr) +__device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f64 %0, [%1];" + "atom.relaxed.gpu.or.u64 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr, + uint64 old_val, + uint64 new_val) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" #else - "st.relaxed.gpu.f64 [%0], %1;" + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "d"(result) + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) : "memory"); + return result; } @@ -821,6 +2381,70 @@ __device__ __forceinline__ void store_release(double* ptr, double result) } +__device__ __forceinline__ uint32 load_acquire(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.gpu.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint32* ptr, uint32 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + 
"st.volatile.u32 [%0], %1;" +#else + "st.release.gpu.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.gpu.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint64* ptr, uint64 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.gpu.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + __device__ __forceinline__ thrust::complex load_relaxed_shared( const thrust::complex* ptr) { @@ -1037,14 +2661,14 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.cta.b16 t, [%1];\n\t" + " ld.relaxed.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1057,13 +2681,13 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) __device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.cta.b16 [%0], t;\n\t" + " st.relaxed.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1075,14 +2699,14 @@ __device__ __forceinline__ __half load_acquire_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.cta.b16 t, [%1];\n\t" + " ld.acquire.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1096,13 +2720,13 @@ __device__ __forceinline__ void store_release_local(__half* ptr, __half result) { membar_acq_rel_local(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.cta.b16 [%0], t;\n\t" + " st.release.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1114,14 +2738,14 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.gpu.b16 t, [%1];\n\t" + " ld.relaxed.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1134,13 +2758,13 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) __device__ __forceinline__ void store_relaxed(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.gpu.b16 
[%0], t;\n\t" + " st.relaxed.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1152,14 +2776,14 @@ __device__ __forceinline__ __half load_acquire(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.gpu.b16 t, [%1];\n\t" + " ld.acquire.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1173,13 +2797,13 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result) { membar_acq_rel(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.gpu.b16 [%0], t;\n\t" + " st.release.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1193,15 +2817,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed_local( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -1216,14 +2840,14 @@ __device__ __forceinline__ void store_relaxed_local( auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.cta.v2.b16 [%0], t;\n\t" + "st.relaxed.cta.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) @@ -1237,15 +2861,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -1260,14 +2884,14 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.gpu.v2.b16 [%0], t;\n\t" + "st.relaxed.gpu.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp 
index 7ff1382d8c6..ce4dc823b98 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #include "core/factorization/cholesky_kernels.hpp" #include +#include #include #include @@ -21,6 +22,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" @@ -45,9 +47,6 @@ namespace cholesky { constexpr int default_block_size = 512; -#include "core/factorization/elimination_forest.hpp" - - namespace kernel { @@ -266,65 +265,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void build_children_from_parents( - std::shared_ptr exec, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = forest.parents.get_size(); - // build COO representation of the tree - array col_idx_array{exec, num_rows}; - const auto col_idxs = col_idx_array.get_data(); - const auto parents = forest.parents.get_const_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - exec->copy(num_rows, parents, col_idxs); - thrust::sequence(thrust_policy(exec), children, children + num_rows, - IndexType{}); - // group by parent - thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, - col_idxs + num_rows, children); - // create child pointers for groups of children - components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, - num_rows + 1, // rows plus sentinel - child_ptrs); -} - - -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = factors->get_size()[0]; - const auto it = thrust::make_counting_iterator(IndexType{}); - thrust::transform( - thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), - [row_ptrs = factors->get_const_row_ptrs(), - col_idxs = factors->get_const_col_idxs(), - num_rows] __device__(IndexType l_col) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - return l_row; - } - } - // sentinel pseudo-root - return static_cast(num_rows); - }); - build_children_from_parents(exec, forest); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..52002e9a211 --- /dev/null +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,371 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include 
+#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace elimination_forest { + + +constexpr int default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = row; + worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomic_min_relaxed(ptr, value); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const 
IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + auto old_parent = + atomic_cas_relaxed(parents + old_rep, old_rep, new_rep); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = atomic_add_relaxed(out_counter, 1); + out_sources[out_i] = edge_sources[edge_id]; + out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + +} // namespace kernel + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + // This is a minimum spanning tree algorithm implementation based on + // A. Fallin, A. Gonzalez, J. Seo, and M. Burtscher, + // "A High-Performance MST Implementation for GPUs,” + // doi: 10.1145/3581784.3607093 + // we don't filter heavy edges since the heaviest edges are necessary to + // reach the last node and we don't need to sort since the COO format + // already sorts by row index. 
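+    // Outline of the rounds below (Boruvka-style contraction):
+    // 1. fill a worklist with every strictly lower-triangular edge
+    // 2. for each worklist edge whose endpoints lie in different
+    //    components, record the minimum incident edge id per component
+    //    representative (mst_find_minimum)
+    // 3. contract the components along those minimum edges and append
+    //    them to the output edge list (mst_join_edges)
+    // 4. reset the recorded minima and repeat until no remaining edge
+    //    connects two different components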
+ const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 6)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + 
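+            // the minima recorded for the representatives touched in this
+            // round are now back at the sentinel, so the next
+            // mst_find_minimum pass starts from a clean slate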
} + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + // two separate sort calls get turned into efficient RadixSort invocations + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void build_children_from_parents( + std::shared_ptr exec, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = forest.parents.get_size(); + // build COO representation of the tree + array col_idx_array{exec, num_rows}; + const auto col_idxs = col_idx_array.get_data(); + const auto parents = forest.parents.get_const_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + exec->copy(num_rows, parents, col_idxs); + thrust::sequence(thrust_policy(exec), children, children + num_rows, + IndexType{}); + // group by parent + thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, + col_idxs + num_rows, children); + // create child pointers for groups of children + components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, + num_rows + 1, // rows plus sentinel + child_ptrs); +} + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = factors->get_size()[0]; + const auto it = thrust::make_counting_iterator(IndexType{}); + thrust::transform( + thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), + [row_ptrs = factors->get_const_row_ptrs(), + col_idxs = factors->get_const_col_idxs(), + num_rows] __device__(IndexType l_col) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + return l_row; + } + } + // sentinel pseudo-root + return static_cast(num_rows); + }); + build_children_from_parents(exec, forest); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index bd2423d4306..8b3167eca97 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -35,6 +35,7 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/reduction.hpp" @@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( } #else if (i < row_len) { - while 
(atomicCAS(local_storage + hash, empty, i) != empty) { + while (atomic_cas_relaxed_local(local_storage + hash, empty, + static_cast(i)) != empty) { hash++; if (hash >= available_storage) { hash = 0; diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 2bb18cbdd22..96370647721 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -153,22 +153,6 @@ __device__ __forceinline__ IndexType disjoint_set_find(IndexType node, } -template -struct atomic_map {}; - - -template <> -struct atomic_map { - using type = int; -}; - - -template <> -struct atomic_map { - using type = unsigned long long; -}; - - template __global__ __launch_bounds__(default_block_size) void connected_components_combine( @@ -176,7 +160,6 @@ __launch_bounds__(default_block_size) void connected_components_combine( const IndexType* __restrict__ col_idxs, IndexType num_rows, IndexType* __restrict__ parents) { - using atomic_type = typename atomic_map::type; const auto row = thread::get_thread_id_flat(); if (row >= num_rows) { return; @@ -196,10 +179,8 @@ __launch_bounds__(default_block_size) void connected_components_combine( auto& max_parent = col_parent < parent ? parent : col_parent; // attempt to attach the (assumed unattached) larger node to the // smaller node - const auto old_parent = static_cast(atomicCAS( - reinterpret_cast(parents + max_parent), - static_cast(max_parent), - static_cast(min_parent))); + const auto old_parent = atomic_cas_relaxed( + parents + max_parent, max_parent, min_parent); // if unsuccessful, proceed with the parent of the (now known // attached) node if (old_parent != max_parent) { @@ -354,7 +335,6 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( IndexType level, IndexType* __restrict__ node_levels, IndexType* __restrict__ level_nodes, IndexType* __restrict__ output_ptr) { - using atomic_type = typename atomic_map::type; const auto source = thread::get_thread_id_flat(); if (source >= num_sources) { return; @@ -362,17 +342,13 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( const auto row = sources[source]; const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; - atomic_type unsigned_unattached{}; const auto unattached = invalid_index(); - memcpy(&unsigned_unattached, &unattached, sizeof(IndexType)); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; if (node_levels[col] == unattached && - atomicCAS(reinterpret_cast(node_levels + col), - unsigned_unattached, - static_cast(level)) == unsigned_unattached) { - const auto output_pos = - atomicAdd(reinterpret_cast(output_ptr), 1); + atomic_cas_relaxed(node_levels + col, unattached, level) == + unattached) { + const auto output_pos = atomic_add_relaxed(output_ptr, 1); level_nodes[output_pos] = col; } } diff --git a/core/components/disjoint_sets.hpp b/core/components/disjoint_sets.hpp index 601817e69d3..77a9177d172 100644 --- a/core/components/disjoint_sets.hpp +++ b/core/components/disjoint_sets.hpp @@ -1,12 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #ifndef GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ #define GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ - -#include +#include 
"core/base/allocator.hpp" namespace gko { @@ -31,10 +30,8 @@ class disjoint_sets { * allocate storage. */ explicit disjoint_sets(std::shared_ptr exec, IndexType size) - : parents_{exec->get_master(), static_cast(size)} - { - parents_.fill(-1); - } + : parents_{static_cast(size), -1, exec->get_master()} + {} /** * Returns true if and only if the element x is the representative of its @@ -138,17 +135,14 @@ class disjoint_sets { * * @return the number of elements represented in this data structure. */ - IndexType get_size() const { return parents_.get_size(); } + IndexType get_size() const { return parents_.size(); } private: - const IndexType& parent(IndexType x) const - { - return parents_.get_const_data()[x]; - } + const IndexType& parent(IndexType x) const { return parents_[x]; } - IndexType& parent(IndexType x) { return parents_.get_data()[x]; } + IndexType& parent(IndexType x) { return parents_[x]; } - array parents_; + vector parents_; }; diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 034bc73e388..437e1145c40 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -23,6 +23,7 @@ #include "core/distributed/partition_kernels.hpp" #include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/factorization/ilu_kernels.hpp" @@ -929,7 +930,6 @@ namespace cholesky { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); @@ -937,6 +937,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky +namespace elimination_forest { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + +} // namespace elimination_forest + + namespace factorization { diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index ce189e3bbdc..2f8d8630a1e 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -15,6 +15,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -27,7 +28,7 @@ namespace { GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -102,7 +103,7 @@ std::unique_ptr Cholesky::generate_impl( forest = std::make_unique>( exec, num_rows); - exec->run(make_forest_from_factor(factors.get(), *forest)); + exec->run(make_from_factor(factors.get(), *forest)); } // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git 
a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 8230fa4552d..fa87edd7fe3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -37,13 +37,6 @@ namespace kernels { const array& tmp_storage) -#define GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType) \ - void forest_from_factor( \ - std::shared_ptr exec, \ - const matrix::Csr* factors, \ - gko::factorization::elimination_forest& forest) - - #define GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType) \ void initialize(std::shared_ptr exec, \ const matrix::Csr* mtx, \ @@ -71,8 +64,6 @@ namespace kernels { template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType); \ template \ - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType); \ - template \ GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_FACTORIZE(ValueType, IndexType) diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 1dc8ff060a0..89bd9505f6d 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -13,10 +13,9 @@ namespace { template -void compute_elim_forest_parent_impl(std::shared_ptr host_exec, - const IndexType* row_ptrs, - const IndexType* cols, IndexType num_rows, - IndexType* parent) +void compute_elimination_forest_parent_impl( + std::shared_ptr host_exec, const IndexType* row_ptrs, + const IndexType* cols, IndexType num_rows, IndexType* parent) { disjoint_sets subtrees{host_exec, num_rows}; array subtree_root_array{host_exec, @@ -50,8 +49,10 @@ void compute_elim_forest_parent_impl(std::shared_ptr host_exec, template -void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, - IndexType* child_ptr, IndexType* child) +void compute_elimination_forest_children_impl(const IndexType* parent, + IndexType size, + IndexType* child_ptr, + IndexType* child) { // count how many times each parent occurs, excluding pseudo-root at // parent == size @@ -74,7 +75,7 @@ void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, template -void compute_elim_forest_postorder_impl( +void compute_elimination_forest_postorder_impl( std::shared_ptr host_exec, const IndexType* parent, const IndexType* child_ptr, const IndexType* child, IndexType size, IndexType* postorder, IndexType* inv_postorder) @@ -111,10 +112,9 @@ void compute_elim_forest_postorder_impl( template -void compute_elim_forest_postorder_parent_impl(const IndexType* parent, - const IndexType* inv_postorder, - IndexType size, - IndexType* postorder_parent) +void compute_elimination_forest_postorder_parent_impl( + const IndexType* parent, const IndexType* inv_postorder, IndexType size, + IndexType* postorder_parent) { for (IndexType row = 0; row < size; row++) { postorder_parent[inv_postorder[row]] = @@ -140,26 +140,27 @@ void elimination_forest::set_executor( template -void compute_elim_forest(const matrix::Csr* mtx, - std::unique_ptr>& forest) +void compute_elimination_forest( + const matrix::Csr* mtx, + std::unique_ptr>& forest) { const auto host_exec = 
mtx->get_executor()->get_master(); const auto host_mtx = make_temporary_clone(host_exec, mtx); const auto num_rows = static_cast(host_mtx->get_size()[0]); forest = std::make_unique>(host_exec, num_rows); - compute_elim_forest_parent_impl(host_exec, host_mtx->get_const_row_ptrs(), - host_mtx->get_const_col_idxs(), num_rows, - forest->parents.get_data()); - compute_elim_forest_children_impl(forest->parents.get_const_data(), - num_rows, forest->child_ptrs.get_data(), - forest->children.get_data()); - compute_elim_forest_postorder_impl( + compute_elimination_forest_parent_impl( + host_exec, host_mtx->get_const_row_ptrs(), + host_mtx->get_const_col_idxs(), num_rows, forest->parents.get_data()); + compute_elimination_forest_children_impl( + forest->parents.get_const_data(), num_rows, + forest->child_ptrs.get_data(), forest->children.get_data()); + compute_elimination_forest_postorder_impl( host_exec, forest->parents.get_const_data(), forest->child_ptrs.get_const_data(), forest->children.get_const_data(), num_rows, forest->postorder.get_data(), forest->inv_postorder.get_data()); - compute_elim_forest_postorder_parent_impl( + compute_elimination_forest_postorder_parent_impl( forest->parents.get_const_data(), forest->inv_postorder.get_const_data(), num_rows, forest->postorder_parents.get_data()); @@ -168,12 +169,13 @@ void compute_elim_forest(const matrix::Csr* mtx, } -#define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elim_forest( \ - const matrix::Csr* mtx, \ +#define GKO_DECLARE_COMPUTE_ELIMINATION_FOREST(ValueType, IndexType) \ + void compute_elimination_forest( \ + const matrix::Csr* mtx, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COMPUTE_ELIMINATION_FOREST); } // namespace factorization diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp index 5307a90384c..50864ec6597 100644 --- a/core/factorization/elimination_forest.hpp +++ b/core/factorization/elimination_forest.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -41,7 +41,7 @@ struct elimination_forest { template -void compute_elim_forest( +void compute_elimination_forest( const matrix::Csr* mtx, std::unique_ptr>& forest); diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp new file mode 100644 index 00000000000..39435846119 --- /dev/null +++ b/core/factorization/elimination_forest_kernels.hpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ + + +#include "core/factorization/elimination_forest.hpp" + +#include + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + +#define GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) \ + void from_factor( \ + std::shared_ptr exec, \ + const matrix::Csr* factors, \ + 
gko::factorization::elimination_forest& forest) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 46d45ba5f04..3eb14e05a4a 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -18,6 +18,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -37,7 +38,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l, GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); // for gko syncfree implementation GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -119,8 +120,7 @@ std::unique_ptr> Ic::generate( forest = std::make_unique>( exec, num_rows); - exec->run( - ic_factorization::make_forest_from_factor(factors.get(), *forest)); + exec->run(ic_factorization::make_from_factor(factors.get(), *forest)); // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index b08602e1a1a..92d0bf663cd 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -15,6 +15,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -24,6 +25,8 @@ namespace factorization { namespace { +GKO_REGISTER_OPERATION(compute_skeleton_tree, + elimination_forest::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, @@ -32,7 +35,8 @@ GKO_REGISTER_OPERATION(symbolic_factorize_simple, lu_factorization::symbolic_factorize_simple); GKO_REGISTER_OPERATION(symbolic_factorize_simple_finalize, lu_factorization::symbolic_factorize_simple_finalize); -GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest); +GKO_REGISTER_HOST_OPERATION(compute_elimination_forest, + compute_elimination_forest); } // namespace @@ -47,8 +51,7 @@ void symbolic_cholesky( using matrix_type = matrix::Csr; GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); - const auto host_exec = exec->get_master(); - exec->run(make_compute_elim_forest(mtx, forest)); + exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; array tmp{exec}; @@ -80,6 +83,55 @@ void 
symbolic_cholesky( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY); +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest) +{ + using matrix_type = matrix::Csr; + GKO_ASSERT_IS_SQUARE_MATRIX(mtx); + const auto exec = mtx->get_executor(); + const auto num_rows = mtx->get_size()[0]; + { + const auto skeleton = + matrix_type::create(exec, mtx->get_size(), num_rows); + exec->run(make_compute_skeleton_tree( + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, + skeleton->get_row_ptrs(), skeleton->get_col_idxs())); + exec->run(make_compute_elimination_forest(skeleton.get(), forest)); + } + array row_ptrs{exec, num_rows + 1}; + array tmp{exec}; + exec->run(make_symbolic_count(mtx, *forest, row_ptrs.get_data(), tmp)); + exec->run(make_prefix_sum_nonnegative(row_ptrs.get_data(), num_rows + 1)); + const auto factor_nnz = + static_cast(get_element(row_ptrs, num_rows)); + factors = matrix_type::create( + exec, mtx->get_size(), array{exec, factor_nnz}, + array{exec, factor_nnz}, std::move(row_ptrs)); + exec->run(make_symbolic(mtx, *forest, factors.get(), tmp)); + factors->sort_by_column_index(); + if (symmetrize) { + auto lt_factor = as(factors->transpose()); + const auto scalar = + initialize>({one()}, exec); + const auto id = matrix::Identity::create(exec, num_rows); + lt_factor->apply(scalar, id, scalar, factors); + } +} + + +#define GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE(ValueType, IndexType) \ + void symbolic_cholesky_device( \ + const matrix::Csr* mtx, bool symmetrize, \ + std::unique_ptr>& factors, \ + std::unique_ptr>& forest) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE); + + template void symbolic_lu_near_symm( const matrix::Csr* mtx, diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 096d8c998bc..f3a6624a762 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -25,6 +25,22 @@ void symbolic_cholesky( std::unique_ptr>& factors, std::unique_ptr>& forest); + +/** + * Computes the symbolic Cholesky factorization of the given matrix on the + * device. + * + * @param mtx the input matrix + * @param symmetrize output the pattern of L + L^T (true) or just L (false)? + * @param factors the output factor(s) + * @param forest the elimination forest of the input matrix + */ +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest); + /** * Computes the symbolic LU factorization of the given matrix. 
* diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 292b366f50e..9bbc9536c93 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -55,7 +55,7 @@ TYPED_TEST(EliminationForest, WorksForExample) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 8, 6, 7, 8, 9, 10})); @@ -92,7 +92,7 @@ TYPED_TEST(EliminationForest, WorksForSeparable) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 2, 10, 4, 5, 9, 7, 9, 9, 10})); @@ -128,7 +128,7 @@ TYPED_TEST(EliminationForest, WorksForPostOrderNotSelfInverse) }, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 5, 6, 7, 8, 9, 10})); GKO_ASSERT_ARRAY_EQ(forest->child_ptrs, @@ -152,7 +152,7 @@ TYPED_TEST(EliminationForest, WorksForAni1) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); // the elimination tree is a path gko::array iota_arr{this->ref, 36}; @@ -178,7 +178,7 @@ TYPED_TEST(EliminationForest, WorksForAni1Amd) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ( forest->parents, diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 3dae62151b3..6b1014cdba0 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -108,6 +108,20 @@ class biggest_valuetype< }; +template +auto get_next_value_optional(NonzeroIterator& it, const NonzeroIterator& end, + size_type next_row, size_type next_col) + -> std::optionalvalue)>::type> +{ + if (it != end && it->row == next_row && it->column == next_col) { + return std::optional((it++)->value); + } else { + // monostate + return {}; + } +} + + template auto get_next_value(NonzeroIterator& it, const NonzeroIterator& end, size_type next_row, size_type next_col) -> @@ -190,11 +204,12 @@ void print_sparsity_pattern(Ostream& os, const MatrixData1& first, os << (row % 10); for (size_type col = 0; col < first.size[1]; col++) { const auto has_first = - get_next_value(first_it, end(first.nonzeros), row, col) != - zero(); + get_next_value_optional(first_it, end(first.nonzeros), row, col) + .has_value(); const auto has_second = - get_next_value(second_it, end(second.nonzeros), row, col) != - zero(); + get_next_value_optional(second_it, end(second.nonzeros), row, + col) + .has_value(); if (has_first) { if (has_second) { os << '+'; diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py 
b/dev_tools/scripts/generate_cuda_memory_ptx.py index 834c49dba46..089cbf5c2dd 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -17,6 +17,8 @@ class ordering: fn_load_suffix: str ptx_store_suffix: str fn_store_suffix: str + ptx_loadstore_suffix: str + fn_loadstore_suffix: str is_relaxed: bool @@ -27,6 +29,13 @@ class type_desc: name: str +@dataclasses.dataclass +class operation: + fn_op_suffix: str + ptx_op_suffix: str + supports_float: bool + supports_signed: bool + memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), @@ -35,16 +44,25 @@ class type_desc: space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True), ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", - ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) + ptx_store_suffix=".release", fn_store_suffix="_release", + ptx_loadstore_suffix=".acq_rel", fn_loadstore_suffix="_acqrel", is_relaxed=False) ] types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), - type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double"), + type_desc(ptx_type_suffix=".u32", val_constraint="r", name="uint32"), + type_desc(ptx_type_suffix=".u64", val_constraint="l", name="uint64")] +operations = [operation(fn_op_suffix="_add", ptx_op_suffix=".add", supports_float=True, supports_signed=True), + operation(fn_op_suffix="_min", ptx_op_suffix=".min", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_max", ptx_op_suffix=".max", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), + operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header -print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +print("""// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -149,6 +167,52 @@ class type_desc: :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); }} +""") + for op in operations: + if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): + # for some reason there is no signed 64 bit atomic support, + # but since the operations are equivalent in two's complement, we can use unsigned instead + new_type = ".u64" if t.ptx_type_suffix == ".s64" and op.fn_op_suffix == "_add" else t.ptx_type_suffix + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed: + print(f""" +__device__ __forceinline__ {t.name} atomic{op.fn_op_suffix}{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} value) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" 
+#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(value) + : "memory"); + return result; +}} +""") + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed and not t.ptx_type_suffix.startswith(".f"): + new_type = ".b" + t.ptx_type_suffix[2:] + print(f""" +__device__ __forceinline__ {t.name} atomic_cas{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} old_val, {t.name} new_val) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}.cas{new_type} %0, [%1], %2, %3;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(old_val), "{t.val_constraint}"(new_val) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex @@ -209,14 +273,14 @@ class type_desc: __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ {t.parent_name} result; - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #else - " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;" "}}" : "={t.val_constraint}"(result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -229,13 +293,13 @@ class type_desc: __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #else - " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result)) @@ -245,7 +309,8 @@ class type_desc: for s in memory_spaces_without_shared: o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True) const_ptr_expr = s.ptr_expr.format( ptr=f"const_cast*>(ptr)") mut_ptr_expr = s.ptr_expr.format(ptr="ptr") @@ -254,15 +319,15 @@ 
class type_desc: {{ {t.parent_name} real_result; {t.parent_name} imag_result; - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #else - "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;" "}}" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -275,14 +340,14 @@ class type_desc: {{ auto real_result = static_cast<{t.parent_name}>(result.real()); auto imag_result = static_cast<{t.parent_name}>(result.imag()); - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #else - "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 81a2a6034ea..a5fc043e171 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(ginkgo_dpcpp distributed/partition_kernels.dp.cpp distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp + factorization/elimination_forest_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp factorization/factorization_kernels.dp.cpp diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 5cd8396be17..76a2f9d2b6a 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -142,16 +142,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor(std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& - forest) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp new file mode 100644 index 00000000000..01310b5c448 
--- /dev/null +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include + +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index 8a98ee822b8..a05bf7de344 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -128,6 +128,51 @@ __device__ __forceinline__ void store_generic(ValueType* ptr, ValueType value) } +template +__device__ __forceinline__ ValueType atomic_add_relaxed(ValueType* ptr, + AddType value) +{ + return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_min_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_min(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_max(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + __atomic_compare_exchange_n(ptr, &old_val, new_val, false, __ATOMIC_RELAXED, + __ATOMIC_RELAXED); + return old_val; +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed_local(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + // no special optimization available for threadblock-local atomic CAS + return atomic_cas_relaxed(ptr, old_val, new_val); +} + + template __device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) { diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 700a01116c9..31f369356f8 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -17,6 +17,7 @@ target_sources(ginkgo_omp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index aa4aabfc731..826e06026da 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -9,6 +9,8 @@ #include 
+#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -130,49 +132,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - const auto num_rows = static_cast(factors->get_size()[0]); - components::fill_array(exec, parents, num_rows, num_rows); -#pragma omp parallel for - for (IndexType l_col = 0; l_col < num_rows; l_col++) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - parents[l_col] = l_row; - break; - } - } - } - // group by parent - array parents_copy{exec, static_cast(num_rows)}; - exec->copy(num_rows, parents, parents_copy.get_data()); - components::fill_seq_array(exec, children, num_rows); - const auto it = - detail::make_zip_iterator(parents_copy.get_data(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), - num_rows, num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..f3f180a515d --- /dev/null +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,123 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz / 2); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + if (col >= row) { + continue; + } + // edge contains (max, min) pair + edges.emplace_back(row, col); + } + } + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + 
[](auto a, auto b) { return a.first < b.first; })); + // output helper array: Store row indices for output rows + // since the input is sorted by edge.first == row, this will be sorted + vector out_rows(size, exec); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + const auto num_rows = static_cast(factors->get_size()[0]); + components::fill_array(exec, parents, num_rows, num_rows); +#pragma omp parallel for + for (IndexType l_col = 0; l_col < num_rows; l_col++) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + parents[l_col] = l_row; + break; + } + } + } + // group by parent + array parents_copy{exec, static_cast(num_rows)}; + exec->copy(num_rows, parents, parents_copy.get_data()); + components::fill_seq_array(exec, children, num_rows); + const auto it = + detail::make_zip_iterator(parents_copy.get_data(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), + num_rows, num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 94e61d43d5c..27c04a4db09 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(ginkgo_reference distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e4d7112a15f..80b6c53663b 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -6,7 +6,6 @@ #include 
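Both the omp kernel above and the reference kernel below build the skeleton tree with the same Kruskal-style sweep: visit the strictly lower-triangular edges in ascending row order and keep an edge only if it joins two components that are still disconnected. The standalone sketch below (plain C++, not Ginkgo code; a small path-halving union-find stands in for the library's internal disjoint_sets, and the 4x4 pattern is made up for illustration) traces that idea.

// Kruskal-style skeleton tree on a small lower-triangular CSR pattern.
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

int find(std::vector<int>& parent, int i)
{
    while (parent[i] != i) {
        parent[i] = parent[parent[i]];  // path halving
        i = parent[i];
    }
    return i;
}

int main()
{
    // strictly lower-triangular pattern (only col < row entries), so the
    // col >= row skip from the kernel is not needed here
    const std::vector<int> row_ptrs{0, 0, 1, 2, 4};
    const std::vector<int> cols{0, 1, 0, 2};
    const int size = 4;
    std::vector<int> parent(size);
    std::iota(parent.begin(), parent.end(), 0);
    std::vector<std::pair<int, int>> tree;  // kept (row, col) edges
    // rows are visited in ascending order, which doubles as ascending weight
    for (int row = 0; row < size; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const int col = cols[nz];
            const int row_rep = find(parent, row);
            const int col_rep = find(parent, col);
            if (row_rep != col_rep) {
                tree.emplace_back(row, col);  // edge max -> min
                parent[row_rep] = col_rep;
            }
        }
    }
    for (const auto& edge : tree) {
        std::printf("%d -> %d\n", edge.first, edge.second);  // 1->0, 2->1, 3->0
    }
    return 0;
}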
#include -#include #include @@ -14,7 +13,6 @@ #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -106,44 +104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto num_rows = static_cast(factors->get_size()[0]); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - // filled with sentinel for unattached nodes - std::fill_n(parents, num_rows, num_rows); - for (IndexType row = 0; row < num_rows; row++) { - const auto row_begin = row_ptrs[row]; - const auto row_end = row_ptrs[row + 1]; - for (auto nz = row_begin; nz < row_end; nz++) { - const auto col = col_idxs[nz]; - // use the lower triangle, min row from column - if (col < row) { - parents[col] = std::min(parents[col], row); - } - } - } - // group by parent - vector parents_copy(parents, parents + num_rows, {exec}); - std::iota(children, children + num_rows, 0); - const auto it = detail::make_zip_iterator(parents_copy.begin(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, - num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..dbf00d02d72 --- /dev/null +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz / 2); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + if (col >= row) { + continue; + } + // edge contains (max, min) pair + edges.emplace_back(row, col); + } + } + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); + 
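The forest_from_factor kernel removed above (and re-added below as elimination_forest::from_factor) derives each node's parent from the factor's sparsity pattern alone: parent[j] = min{ i > j : l_ij != 0 }, i.e. the first row below the diagonal with an entry in column j. A standalone sketch of just that rule on a made-up 4x4 lower-triangular pattern (plain C++, illustrative only):

// parent[j] = smallest row index i > j with a nonzero l_ij, read from the
// strictly lower-triangular CSR pattern of L; roots keep the sentinel value.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> row_ptrs{0, 0, 1, 2, 4};
    const std::vector<int> cols{0, 1, 0, 2};
    const int num_rows = 4;
    // sentinel num_rows marks nodes without a parent (tree roots)
    std::vector<int> parents(num_rows, num_rows);
    for (int row = 0; row < num_rows; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const int col = cols[nz];
            parents[col] = std::min(parents[col], row);
        }
    }
    for (int node = 0; node < num_rows; node++) {
        std::printf("parent of %d: %d\n", node, parents[node]);  // 1, 2, 3, 4
    }
    return 0;
}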
// output helper array: Store row indices for output rows + // since the input is sorted by edge.first == row, this will be sorted + vector out_rows(size, exec); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto num_rows = static_cast(factors->get_size()[0]); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + // filled with sentinel for unattached nodes + std::fill_n(parents, num_rows, num_rows); + for (IndexType row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + // use the lower triangle, min row from column + if (col < row) { + parents[col] = std::min(parents[col], row); + } + } + } + // group by parent + vector parents_copy(parents, parents + num_rows, {exec}); + std::iota(children, children + num_rows, 0); + const auto it = detail::make_zip_iterator(parents_copy.begin(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, + num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 630aa2d0406..0638f8ca931 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -17,6 +17,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -91,7 +92,8 @@ class Cholesky : public ::testing::Test { l_factor_ref->get_num_stored_elements()); combined = matrix_type::create(ref, combined_ref->get_size(), combined_ref->get_num_stored_elements()); - gko::factorization::compute_elim_forest(l_factor_ref.get(), forest); + gko::factorization::compute_elimination_forest(l_factor_ref.get(), + forest); // init sparsity lookup ref->copy(num_rows + 1, l_factor_ref->get_const_row_ptrs(), l_factor->get_row_ptrs()); @@ -140,6 +142,9 @@ class Cholesky : public ::testing::Test { fn(); } { + // structurally this is the example from Liu 
1990 + // "The Role of Elimination Trees in Sparse Factorization" + // https://doi.org/10.1137/0611010. SCOPED_TRACE("example"); this->setup( {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0}, @@ -233,16 +238,43 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) +{ + using matrix_type = typename TestFixture::matrix_type; + using elimination_forest = typename TestFixture::elimination_forest; + using index_type = typename TestFixture::index_type; + this->forall_matrices( + [this] { + auto skeleton = matrix_type::create( + this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); + std::unique_ptr skeleton_forest; + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::kernels::reference::elimination_forest::compute_skeleton_tree( + this->ref, this->mtx->get_const_row_ptrs(), + this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], + skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + + this->assert_equal_forests(*skeleton_forest, *this->forest); + }, + true); +} + + TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::array row_nnz{this->ref, this->num_rows}; gko::kernels::reference::cholesky::symbolic_count( @@ -258,13 +290,12 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) TYPED_TEST(Cholesky, KernelSymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::cholesky::symbolic_count( this->ref, this->mtx.get(), *this->forest, this->l_factor->get_row_ptrs(), this->tmp); @@ -329,7 +360,7 @@ TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing) elimination_forest forest{this->ref, static_cast(this->num_rows)}; - gko::kernels::reference::cholesky::forest_from_factor( + gko::kernels::reference::elimination_forest::from_factor( this->ref, combined_factor.get(), forest); this->assert_equal_forests(forest, *forest_ref); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ae2894d3696..e2696f11d89 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -14,9 +14,11 @@ #include #include +#include "core/components/disjoint_sets.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_lookup.hpp" #include 
"core/test/utils.hpp" @@ -45,14 +47,16 @@ class CholeskySymbolic : public CommonTestFixture { matrices.emplace_back( "example small", gko::initialize( - {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 0, 0, 1}}, ref)); + {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); + // this is the example from Liu 1990 https://doi.org/10.1137/0611010. + // "The Role of Elimination Trees in Sparse Factorization" matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, + {1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, @@ -65,7 +69,7 @@ class CholeskySymbolic : public CommonTestFixture { {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, @@ -88,6 +92,11 @@ class CholeskySymbolic : public CommonTestFixture { std::ifstream ani1_amd_stream{gko::matrices::location_ani1_amd_mtx}; matrices.emplace_back("ani1_amd", gko::read(ani1_amd_stream, ref)); + std::ifstream ani4_stream{gko::matrices::location_ani4_mtx}; + matrices.emplace_back("ani4", gko::read(ani4_stream, ref)); + std::ifstream ani4_amd_stream{gko::matrices::location_ani4_amd_mtx}; + matrices.emplace_back("ani4_amd", + gko::read(ani4_amd_stream, ref)); } void assert_equal_forests(elimination_forest& lhs, elimination_forest& rhs, @@ -103,6 +112,27 @@ class CholeskySymbolic : public CommonTestFixture { } } + index_type check_mst(const std::unique_ptr& mst) const + { + gko::matrix_data mst_data; + const auto size = mst->get_size(); + mst->write(mst_data); + gko::disjoint_sets sets(this->ref, size[0]); + index_type weight_sum{}; + // need an IIFE because the assertions would return something + [&] { + for (const auto entry : mst_data.nonzeros) { + ASSERT_GT(entry.row, entry.column); + weight_sum += entry.row; + const auto row_rep = sets.find(entry.row); + const auto col_rep = sets.find(entry.column); + ASSERT_NE(row_rep, col_rep); + sets.join(row_rep, col_rep); + } + }(); + return weight_sum; + } + std::vector>> matrices; gko::array tmp; @@ -125,6 +155,52 @@ using Types = gko::test::cartesian_type_product_tmatrices) { + SCOPED_TRACE(pair.first); + const auto& mtx = pair.second; + // check for correctness: is mtx symmetric? 
+ GKO_ASSERT_MTX_EQ_SPARSITY(mtx, gko::as(mtx->transpose())); + const auto dmtx = gko::clone(this->exec, mtx); + const auto size = mtx->get_size(); + const auto skeleton = matrix_type::create(this->ref, size, size[0]); + const auto dskeleton = matrix_type::create(this->exec, size, size[0]); + + gko::kernels::reference::elimination_forest::compute_skeleton_tree( + this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest:: + compute_skeleton_tree(this->exec, dmtx->get_const_row_ptrs(), + dmtx->get_const_col_idxs(), size[0], + dskeleton->get_row_ptrs(), + dskeleton->get_col_idxs()); + + // check that the created graphs are trees and have the same edge sum + const auto weight_sum = this->check_mst(skeleton); + const auto dweight_sum = this->check_mst(dskeleton); + ASSERT_EQ(weight_sum, dweight_sum); + // while the MSTs may be different, they should produce the same + // elimination forest as the original matrix + std::unique_ptr forest; + std::unique_ptr skeleton_forest; + std::unique_ptr dskeleton_forest; + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elimination_forest(dskeleton.get(), + dskeleton_forest); + // the parents array fully determines the elimination forest + GKO_ASSERT_ARRAY_EQ(forest->parents, skeleton_forest->parents); + GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, + dskeleton_forest->parents); + } +} + + TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; @@ -136,8 +212,8 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) const auto dmtx = gko::clone(this->exec, mtx); std::unique_ptr forest; std::unique_ptr dforest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array row_nnz{this->ref, mtx->get_size()[0]}; gko::array drow_nnz{this->exec, mtx->get_size()[0]}; @@ -163,7 +239,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) const auto dmtx = gko::clone(this->exec, mtx); const auto num_rows = mtx->get_size()[0]; std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); gko::array row_ptrs{this->ref, num_rows + 1}; gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_ptrs.get_data(), this->tmp); @@ -180,7 +256,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) gko::array{this->exec, nnz}, row_ptrs); // need to call the device kernels to initialize dtmp std::unique_ptr dforest; - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); @@ -232,7 +308,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest::from_factor( this->exec, dfactors.get(), 
dforest); this->assert_equal_forests(*forest, dforest); @@ -280,8 +356,9 @@ class Cholesky : public CommonTestFixture { mtx_chol_sparsity->copy_from(mtx_chol.get()); dmtx_chol_sparsity = sparsity_pattern_type::create(exec); dmtx_chol_sparsity->copy_from(mtx_chol_sparsity.get()); - gko::factorization::compute_elim_forest(mtx_chol.get(), forest); - gko::factorization::compute_elim_forest(dmtx_chol.get(), dforest); + gko::factorization::compute_elimination_forest(mtx_chol.get(), forest); + gko::factorization::compute_elimination_forest(dmtx_chol.get(), + dforest); } void forall_matrices(std::function<void()> fn)
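Once the parents array is filled, both from_factor implementations above group the children of each node: the node indices are sorted stably by their parent, and the sorted parent keys are turned into child_ptrs offsets (the kernels do this via a zip-iterator sort followed by convert_idxs_to_ptrs). A standalone sketch of that grouping step (plain C++, not Ginkgo code; a counting pass stands in for convert_idxs_to_ptrs, and the parents array is made up):

// Group elimination-tree children by parent and build CSR-like child_ptrs;
// the extra slot at index num_rows collects the roots (sentinel parent).
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    // parents[i] = parent of node i; roots point to the sentinel num_rows
    const std::vector<int> parents{2, 2, 3, 4};
    const int num_rows = 4;
    std::vector<int> children(num_rows);
    std::iota(children.begin(), children.end(), 0);
    // stable sort keeps children of the same parent in ascending node order
    std::stable_sort(children.begin(), children.end(),
                     [&](int a, int b) { return parents[a] < parents[b]; });
    // child_ptrs[p] .. child_ptrs[p + 1] delimits the children of node p
    std::vector<int> child_ptrs(num_rows + 2, 0);
    for (const int child : children) {
        child_ptrs[parents[child] + 1]++;  // histogram of children per parent
    }
    for (int i = 0; i <= num_rows; i++) {
        child_ptrs[i + 1] += child_ptrs[i];  // prefix sum
    }
    for (int p = 0; p <= num_rows; p++) {
        std::printf("node %d children:", p);
        for (int i = child_ptrs[p]; i < child_ptrs[p + 1]; i++) {
            std::printf(" %d", children[i]);
        }
        std::printf("\n");
    }
    return 0;
}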