diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp index 61eeaeeb627..50c86c19bcf 100644 --- a/benchmark/sparse_blas/operations.cpp +++ b/benchmark/sparse_blas/operations.cpp @@ -614,8 +614,9 @@ class SymbolicLuNearSymmOperation : public BenchmarkOperation { class SymbolicCholeskyOperation : public BenchmarkOperation { public: - explicit SymbolicCholeskyOperation(const Mtx* mtx, bool symmetric) - : mtx_{mtx}, symmetric_{symmetric}, result_{} + explicit SymbolicCholeskyOperation(const Mtx* mtx, bool device, + bool symmetric) + : mtx_{mtx}, device_{device}, symmetric_{symmetric}, result_{} {} std::pair validate() const override @@ -643,8 +644,13 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { void run() override { - gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, - forest_); + if (device_) { + gko::factorization::symbolic_cholesky_device(mtx_, symmetric_, + result_, forest_); + } else { + gko::factorization::symbolic_cholesky(mtx_, symmetric_, result_, + forest_); + } } void write_stats(json& object) override @@ -654,6 +660,7 @@ class SymbolicCholeskyOperation : public BenchmarkOperation { private: const Mtx* mtx_; + bool device_; bool symmetric_; std::unique_ptr result_; std::unique_ptr> forest_; @@ -791,13 +798,25 @@ const std::map(mtx); }}, + {"symbolic_cholesky_device", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + false); + }}, + {"symbolic_cholesky_device_symmetric", + [](const Mtx* mtx) { + return std::make_unique(mtx, true, + true); + }}, {"symbolic_cholesky", [](const Mtx* mtx) { - return std::make_unique(mtx, false); + return std::make_unique(mtx, false, + false); }}, {"symbolic_cholesky_symmetric", [](const Mtx* mtx) { - return std::make_unique(mtx, true); + return std::make_unique(mtx, false, + true); }}, {"reorder_rcm", [](const Mtx* mtx) { diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout index b5fc8998be0..f40bf8814dd 100644 --- a/benchmark/test/reference/sparse_blas.reordered.stdout +++ b/benchmark/test/reference/sparse_blas.reordered.stdout @@ -9,7 +9,7 @@ "bandwidth": 1.0, "repetitions": 10, "components": { - "compute_elim_forest": 1.0, + "compute_elimination_forest": 1.0, "allocate": 1.0, "free": 1.0, "components::fill_array": 1.0, diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index 5cfa55ca687..d7d1de30dea 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -12,6 +12,7 @@ set(CUDA_HIP_SOURCES distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp factorization/factorization_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp factorization/lu_kernels.cpp diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index f759c613f45..f77481cd793 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -96,6 +96,83 @@ __device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result) } +__device__ __forceinline__ int32 atomic_add_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.s32 %0, [%1], %2;" +#elif 
__CUDA_ARCH__ < 700 + "atom.shared.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_shared(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed_shared(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + __device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr) { int64 result; @@ -127,6 +204,83 @@ __device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result) } +__device__ __forceinline__ int64 atomic_add_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_shared(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_cas_relaxed_shared(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + 
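
// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the patch): a minimal usage
// sketch of the relaxed shared-memory atomics added above. It assumes the
// kernel is compiled inside Ginkgo's CUDA/HIP kernel namespace so that
// `int32` and `atomic_min_relaxed_shared` from this header are visible; the
// kernel name and launch shape are hypothetical.
__global__ void block_min_example(const int32* __restrict__ values, int32 size,
                                  int32* __restrict__ block_minima)
{
    __shared__ int32 block_min;
    if (threadIdx.x == 0) {
        block_min = 0x7fffffff;  // int32 max as the reduction identity
    }
    __syncthreads();
    const auto i = static_cast<int32>(blockIdx.x * blockDim.x + threadIdx.x);
    if (i < size) {
        // relaxed ordering is sufficient here: the barrier below orders the
        // shared-memory update before thread 0 reads it back
        atomic_min_relaxed_shared(&block_min, values[i]);
    }
    __syncthreads();
    if (threadIdx.x == 0) {
        block_minima[blockIdx.x] = block_min;
    }
}
// The acquire/release shared-memory variants defined further down keep their
// fences for producer-consumer handshakes that do not pass through a full
// block barrier.
// ---------------------------------------------------------------------------
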
__device__ __forceinline__ float load_relaxed_shared(const float* ptr) { float result; @@ -158,6 +312,25 @@ __device__ __forceinline__ void store_relaxed_shared(float* ptr, float result) } +__device__ __forceinline__ float atomic_add_relaxed_shared(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "f"(value) + : "memory"); + return result; +} + + __device__ __forceinline__ double load_relaxed_shared(const double* ptr) { double result; @@ -189,31 +362,49 @@ __device__ __forceinline__ void store_relaxed_shared(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed_shared(double* ptr, + double value) { - int32 result; + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_shared(const uint32* ptr) +{ + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.shared.s32 %0, [%1];" + "ld.volatile.shared.u32 %0, [%1];" #else - "ld.acquire.cta.shared.s32 %0, [%1];" + "ld.relaxed.cta.shared.u32 %0, [%1];" #endif : "=r"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); - membar_acq_rel_shared(); + return result; } -__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed_shared(uint32* ptr, uint32 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.s32 [%0], %1;" + "st.volatile.shared.u32 [%0], %1;" #else - "st.release.cta.shared.s32 [%0], %1;" + "st.relaxed.cta.shared.u32 [%0], %1;" #endif ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(result) @@ -221,31 +412,145 @@ __device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) } -__device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed_shared(uint32* ptr, + uint32 value) { - int64 result; + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u32 %0, [%1], %2;" +#elif 
__CUDA_ARCH__ < 700 + "atom.shared.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_shared(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_cas_relaxed_shared(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_shared(const uint64* ptr) +{ + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.shared.s64 %0, [%1];" + "ld.volatile.shared.u64 %0, [%1];" #else - "ld.acquire.cta.shared.s64 %0, [%1];" + "ld.relaxed.cta.shared.u64 %0, [%1];" #endif : "=l"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) : "memory"); - membar_acq_rel_shared(); + return result; } -__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +__device__ __forceinline__ void store_relaxed_shared(uint64* ptr, uint64 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.s64 [%0], %1;" + "st.volatile.shared.u64 [%0], %1;" #else - "st.release.cta.shared.s64 [%0], %1;" + "st.relaxed.cta.shared.u64 [%0], %1;" #endif ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(result) @@ -253,140 +558,1329 @@ __device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) } -__device__ __forceinline__ float load_acquire_shared(const float* ptr) +__device__ __forceinline__ uint64 atomic_add_relaxed_shared(uint64* ptr, + uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.shared.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.shared.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.add.u64 %0, [%1], %2;" #else - "ld.acquire.cta.shared.f32 %0, [%1];" + "atom.relaxed.cta.shared.add.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); - membar_acq_rel_shared(); return result; } -__device__ __forceinline__ void store_release_shared(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_min_relaxed_shared(uint64* ptr, + uint64 value) { - 
membar_acq_rel_shared(); + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.shared.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.shared.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.min.u64 %0, [%1], %2;" #else - "st.release.cta.shared.f32 [%0], %1;" + "atom.relaxed.cta.shared.min.u64 %0, [%1], %2;" #endif - ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "f"(result) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_shared(const double* ptr) +__device__ __forceinline__ uint64 atomic_max_relaxed_shared(uint64* ptr, + uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.shared.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.shared.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.max.u64 %0, [%1], %2;" #else - "ld.acquire.cta.shared.f64 %0, [%1];" + "atom.relaxed.cta.shared.max.u64 %0, [%1], %2;" #endif - : "=d"(result) - : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) : "memory"); - membar_acq_rel_shared(); return result; } -__device__ __forceinline__ void store_release_shared(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_and_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.and.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_shared(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.shared.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_cas_relaxed_shared(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.shared.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.shared.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.shared.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(ptr)), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.s32 %0, [%1];" +#else + "ld.acquire.cta.shared.s32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.s32 [%0], %1;" +#else + "st.release.cta.shared.s32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_shared(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.s64 %0, [%1];" +#else + 
"ld.acquire.cta.shared.s64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.s64 [%0], %1;" +#else + "st.release.cta.shared.s64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_shared(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.f32 %0, [%1];" +#else + "ld.acquire.cta.shared.f32 %0, [%1];" +#endif + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(float* ptr, float result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.f32 [%0], %1;" +#else + "st.release.cta.shared.f32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_shared(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.f64 %0, [%1];" +#else + "ld.acquire.cta.shared.f64 %0, [%1];" +#endif + : "=d"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(double* ptr, double result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.f64 [%0], %1;" +#else + "st.release.cta.shared.f64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_shared(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u32 %0, [%1];" +#else + "ld.acquire.cta.shared.u32 %0, [%1];" +#endif + : "=r"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint32* ptr, uint32 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u32 [%0], %1;" +#else + "st.release.cta.shared.u32 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_shared(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.u64 %0, [%1];" +#else + "ld.acquire.cta.shared.u64 %0, [%1];" +#endif + : "=l"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast(ptr))) + : "memory"); + membar_acq_rel_shared(); + return result; +} + + +__device__ __forceinline__ void store_release_shared(uint64* ptr, uint64 result) +{ + membar_acq_rel_shared(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.u64 [%0], %1;" +#else + "st.release.cta.shared.u64 [%0], %1;" +#endif + ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : 
"memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed_local(int32* ptr, + int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed_local(int32* ptr, + int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.relaxed.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 atomic_add_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_min_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 atomic_max_relaxed_local(int64* ptr, + int64 value) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.s64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.s64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ 
int64 atomic_cas_relaxed_local(int64* ptr, + int64 old_val, + int64 new_val) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ float load_relaxed_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.relaxed.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.relaxed.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ float atomic_add_relaxed_local(float* ptr, + float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.relaxed.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.relaxed.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ double atomic_add_relaxed_local(double* ptr, + double value) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.relaxed.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint32* ptr, uint32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.relaxed.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 atomic_add_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_min_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; 
+} + + +__device__ __forceinline__ uint32 atomic_max_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_and_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_or_relaxed_local(uint32* ptr, + uint32 value) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u32 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 atomic_cas_relaxed_local(uint32* ptr, + uint32 old_val, + uint32 new_val) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 load_relaxed_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.relaxed.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed_local(uint64* ptr, uint64 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.relaxed.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 atomic_add_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.add.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.add.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_min_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.min.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.min.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_max_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.max.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.max.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_and_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.and.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.and.u64 %0, 
[%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_or_relaxed_local(uint64* ptr, + uint64 value) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.or.u64 %0, [%1], %2;" +#else + "atom.relaxed.cta.or.u64 %0, [%1], %2;" +#endif + : "=l"(result) + : "l"(ptr), "l"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint64 atomic_cas_relaxed_local(uint64* ptr, + uint64 old_val, + uint64 new_val) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.cta.cas.b64 %0, [%1], %2, %3;" +#else + "atom.relaxed.cta.cas.b64 %0, [%1], %2, %3;" +#endif + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.acquire.cta.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.release.cta.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.acquire.cta.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s64 [%0], %1;" +#else + "st.release.cta.s64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ float load_acquire_local(const float* ptr) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f32 %0, [%1];" +#else + "ld.acquire.cta.f32 %0, [%1];" +#endif + : "=f"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(float* ptr, float result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f32 [%0], %1;" +#else + "st.release.cta.f32 [%0], %1;" +#endif + ::"l"(ptr), + "f"(result) + : "memory"); +} + + +__device__ __forceinline__ double load_acquire_local(const double* ptr) +{ + double result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.f64 %0, [%1];" +#else + "ld.acquire.cta.f64 %0, [%1];" +#endif + : "=d"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(double* ptr, double result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.f64 [%0], %1;" +#else + "st.release.cta.f64 [%0], %1;" +#endif + ::"l"(ptr), + "d"(result) + : "memory"); +} + + +__device__ __forceinline__ uint32 load_acquire_local(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.cta.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); 
+ membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint32* ptr, uint32 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u32 [%0], %1;" +#else + "st.release.cta.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire_local(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.cta.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel_local(); + return result; +} + + +__device__ __forceinline__ void store_release_local(uint64* ptr, uint64 result) +{ + membar_acq_rel_local(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.cta.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s32 %0, [%1];" +#else + "ld.relaxed.gpu.s32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +{ + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.s32 [%0], %1;" +#else + "st.relaxed.gpu.s32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ int32 atomic_add_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_min_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.min.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.min.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_max_relaxed(int32* ptr, int32 value) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.max.s32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s32 %0, [%1], %2;" +#else + "atom.relaxed.gpu.max.s32 %0, [%1], %2;" +#endif + : "=r"(result) + : "l"(ptr), "r"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ int32 atomic_cas_relaxed(int32* ptr, int32 old_val, + int32 new_val) +{ + int32 result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" +#else + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" +#endif + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) + : "memory"); + return result; +} + + +__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +{ + int64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.s64 %0, [%1];" +#else + "ld.relaxed.gpu.s64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + + return result; +} + + +__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) { - membar_acq_rel_shared(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.shared.f64 [%0], %1;" + "st.volatile.s64 [%0], %1;" #else - "st.release.cta.shared.f64 [%0], %1;" + "st.relaxed.gpu.s64 [%0], 
%1;" #endif - ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), - "d"(result) + ::"l"(ptr), + "l"(result) : "memory"); } -__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr) +__device__ __forceinline__ int64 atomic_add_relaxed(int64* ptr, int64 value) { - int32 result; + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.cta.s32 %0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result) +__device__ __forceinline__ int64 atomic_min_relaxed(int64* ptr, int64 value) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.s64 %0, [%1], %2;" #else - "st.relaxed.cta.s32 [%0], %1;" + "atom.relaxed.gpu.min.s64 %0, [%1], %2;" #endif - ::"l"(ptr), - "r"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr) +__device__ __forceinline__ int64 atomic_max_relaxed(int64* ptr, int64 value) { int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.s64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.s64 %0, [%1], %2;" #else - "ld.relaxed.cta.s64 %0, [%1];" + "atom.relaxed.gpu.max.s64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result) +__device__ __forceinline__ int64 atomic_cas_relaxed(int64* ptr, int64 old_val, + int64 new_val) { + int64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" #else - "st.relaxed.cta.s64 [%0], %1;" + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed_local(const float* ptr) +__device__ __forceinline__ float load_relaxed(const float* ptr) { float result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f32 %0, [%1];" #else - "ld.relaxed.cta.f32 %0, [%1];" + "ld.relaxed.gpu.f32 %0, [%1];" #endif : "=f"(result) : "l"(const_cast(ptr)) @@ -396,13 +1890,13 @@ __device__ __forceinline__ float load_relaxed_local(const float* ptr) } -__device__ __forceinline__ void store_relaxed_local(float* ptr, float result) +__device__ __forceinline__ void store_relaxed(float* ptr, float result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f32 [%0], %1;" #else - "st.relaxed.cta.f32 [%0], %1;" + "st.relaxed.gpu.f32 [%0], %1;" #endif ::"l"(ptr), "f"(result) @@ -410,14 +1904,32 @@ __device__ __forceinline__ void store_relaxed_local(float* ptr, float result) } -__device__ __forceinline__ double load_relaxed_local(const double* ptr) +__device__ __forceinline__ float atomic_add_relaxed(float* ptr, float value) +{ + float result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f32 %0, [%1], %2;" +#else + 
"atom.relaxed.gpu.add.f32 %0, [%1], %2;" +#endif + : "=f"(result) + : "l"(ptr), "f"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ double load_relaxed(const double* ptr) { double result; asm volatile( #if __CUDA_ARCH__ < 700 "ld.volatile.f64 %0, [%1];" #else - "ld.relaxed.cta.f64 %0, [%1];" + "ld.relaxed.gpu.f64 %0, [%1];" #endif : "=d"(result) : "l"(const_cast(ptr)) @@ -427,13 +1939,13 @@ __device__ __forceinline__ double load_relaxed_local(const double* ptr) } -__device__ __forceinline__ void store_relaxed_local(double* ptr, double result) +__device__ __forceinline__ void store_relaxed(double* ptr, double result) { asm volatile( #if __CUDA_ARCH__ < 700 "st.volatile.f64 [%0], %1;" #else - "st.relaxed.cta.f64 [%0], %1;" + "st.relaxed.gpu.f64 [%0], %1;" #endif ::"l"(ptr), "d"(result) @@ -441,31 +1953,48 @@ __device__ __forceinline__ void store_relaxed_local(double* ptr, double result) } -__device__ __forceinline__ int32 load_acquire_local(const int32* ptr) +__device__ __forceinline__ double atomic_add_relaxed(double* ptr, double value) { - int32 result; + double result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom.add.f64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.f64 %0, [%1], %2;" +#else + "atom.relaxed.gpu.add.f64 %0, [%1], %2;" +#endif + : "=d"(result) + : "l"(ptr), "d"(value) + : "memory"); + return result; +} + + +__device__ __forceinline__ uint32 load_relaxed(const uint32* ptr) +{ + uint32 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.u32 %0, [%1];" #else - "ld.acquire.cta.s32 %0, [%1];" + "ld.relaxed.gpu.u32 %0, [%1];" #endif : "=r"(result) - : "l"(const_cast(ptr)) + : "l"(const_cast(ptr)) : "memory"); - membar_acq_rel_local(); + return result; } -__device__ __forceinline__ void store_release_local(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(uint32* ptr, uint32 result) { - membar_acq_rel_local(); asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.u32 [%0], %1;" #else - "st.release.cta.s32 [%0], %1;" + "st.relaxed.gpu.u32 [%0], %1;" #endif ::"l"(ptr), "r"(result) @@ -473,223 +2002,254 @@ __device__ __forceinline__ void store_release_local(int32* ptr, int32 result) } -__device__ __forceinline__ int64 load_acquire_local(const int64* ptr) +__device__ __forceinline__ uint32 atomic_add_relaxed(uint32* ptr, uint32 value) { - int64 result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u32 %0, [%1], %2;" #else - "ld.acquire.cta.s64 %0, [%1];" + "atom.relaxed.gpu.add.u32 %0, [%1], %2;" #endif - : "=l"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(int64* ptr, int64 result) +__device__ __forceinline__ uint32 atomic_min_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u32 %0, [%1], %2;" #else - "st.release.cta.s64 [%0], %1;" + "atom.relaxed.gpu.min.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_acquire_local(const float* ptr) +__device__ __forceinline__ uint32 
atomic_max_relaxed(uint32* ptr, uint32 value) { - float result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f32 %0, [%1];" + "atom.relaxed.gpu.max.u32 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(float* ptr, float result) +__device__ __forceinline__ uint32 atomic_and_relaxed(uint32* ptr, uint32 value) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u32 %0, [%1], %2;" #else - "st.release.cta.f32 [%0], %1;" + "atom.relaxed.gpu.and.u32 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_acquire_local(const double* ptr) +__device__ __forceinline__ uint32 atomic_or_relaxed(uint32* ptr, uint32 value) { - double result; + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u32 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u32 %0, [%1], %2;" #else - "ld.acquire.cta.f64 %0, [%1];" + "atom.relaxed.gpu.or.u32 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=r"(result) + : "l"(ptr), "r"(value) : "memory"); - membar_acq_rel_local(); return result; } -__device__ __forceinline__ void store_release_local(double* ptr, double result) +__device__ __forceinline__ uint32 atomic_cas_relaxed(uint32* ptr, + uint32 old_val, + uint32 new_val) { - membar_acq_rel_local(); + uint32 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b32 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b32 %0, [%1], %2, %3;" #else - "st.release.cta.f64 [%0], %1;" + "atom.relaxed.gpu.cas.b32 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "d"(result) + : "=r"(result) + : "l"(ptr), "r"(old_val), "r"(new_val) : "memory"); + return result; } -__device__ __forceinline__ int32 load_relaxed(const int32* ptr) +__device__ __forceinline__ uint64 load_relaxed(const uint64* ptr) { - int32 result; + uint64 result; asm volatile( #if __CUDA_ARCH__ < 700 - "ld.volatile.s32 %0, [%1];" + "ld.volatile.u64 %0, [%1];" #else - "ld.relaxed.gpu.s32 %0, [%1];" + "ld.relaxed.gpu.u64 %0, [%1];" #endif - : "=r"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(const_cast(ptr)) : "memory"); return result; } -__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result) +__device__ __forceinline__ void store_relaxed(uint64* ptr, uint64 result) { asm volatile( #if __CUDA_ARCH__ < 700 - "st.volatile.s32 [%0], %1;" + "st.volatile.u64 [%0], %1;" #else - "st.relaxed.gpu.s32 [%0], %1;" + "st.relaxed.gpu.u64 [%0], %1;" #endif ::"l"(ptr), - "r"(result) + "l"(result) : "memory"); } -__device__ __forceinline__ int64 load_relaxed(const int64* ptr) +__device__ __forceinline__ uint64 atomic_add_relaxed(uint64* ptr, uint64 value) { - int64 result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.s64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.add.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.add.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.s64 
%0, [%1];" + "atom.relaxed.gpu.add.u64 %0, [%1], %2;" #endif : "=l"(result) - : "l"(const_cast(ptr)) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result) +__device__ __forceinline__ uint64 atomic_min_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.s64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.min.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.min.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.s64 [%0], %1;" + "atom.relaxed.gpu.min.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "l"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ float load_relaxed(const float* ptr) +__device__ __forceinline__ uint64 atomic_max_relaxed(uint64* ptr, uint64 value) { - float result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f32 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.max.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.max.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f32 %0, [%1];" + "atom.relaxed.gpu.max.u64 %0, [%1], %2;" #endif - : "=f"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(float* ptr, float result) +__device__ __forceinline__ uint64 atomic_and_relaxed(uint64* ptr, uint64 value) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f32 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.and.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.and.u64 %0, [%1], %2;" #else - "st.relaxed.gpu.f32 [%0], %1;" + "atom.relaxed.gpu.and.u64 %0, [%1], %2;" #endif - ::"l"(ptr), - "f"(result) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); + return result; } -__device__ __forceinline__ double load_relaxed(const double* ptr) +__device__ __forceinline__ uint64 atomic_or_relaxed(uint64* ptr, uint64 value) { - double result; + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "ld.volatile.f64 %0, [%1];" +#if __CUDA_ARCH__ < 600 + "atom.or.u64 %0, [%1], %2;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.or.u64 %0, [%1], %2;" #else - "ld.relaxed.gpu.f64 %0, [%1];" + "atom.relaxed.gpu.or.u64 %0, [%1], %2;" #endif - : "=d"(result) - : "l"(const_cast(ptr)) + : "=l"(result) + : "l"(ptr), "l"(value) : "memory"); - return result; } -__device__ __forceinline__ void store_relaxed(double* ptr, double result) +__device__ __forceinline__ uint64 atomic_cas_relaxed(uint64* ptr, + uint64 old_val, + uint64 new_val) { + uint64 result; asm volatile( -#if __CUDA_ARCH__ < 700 - "st.volatile.f64 [%0], %1;" +#if __CUDA_ARCH__ < 600 + "atom.cas.b64 %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom.gpu.cas.b64 %0, [%1], %2, %3;" #else - "st.relaxed.gpu.f64 [%0], %1;" + "atom.relaxed.gpu.cas.b64 %0, [%1], %2, %3;" #endif - ::"l"(ptr), - "d"(result) + : "=l"(result) + : "l"(ptr), "l"(old_val), "l"(new_val) : "memory"); + return result; } @@ -821,6 +2381,70 @@ __device__ __forceinline__ void store_release(double* ptr, double result) } +__device__ __forceinline__ uint32 load_acquire(const uint32* ptr) +{ + uint32 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u32 %0, [%1];" +#else + "ld.acquire.gpu.u32 %0, [%1];" +#endif + : "=r"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint32* ptr, uint32 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + 
"st.volatile.u32 [%0], %1;" +#else + "st.release.gpu.u32 [%0], %1;" +#endif + ::"l"(ptr), + "r"(result) + : "memory"); +} + + +__device__ __forceinline__ uint64 load_acquire(const uint64* ptr) +{ + uint64 result; + asm volatile( +#if __CUDA_ARCH__ < 700 + "ld.volatile.u64 %0, [%1];" +#else + "ld.acquire.gpu.u64 %0, [%1];" +#endif + : "=l"(result) + : "l"(const_cast(ptr)) + : "memory"); + membar_acq_rel(); + return result; +} + + +__device__ __forceinline__ void store_release(uint64* ptr, uint64 result) +{ + membar_acq_rel(); + asm volatile( +#if __CUDA_ARCH__ < 700 + "st.volatile.u64 [%0], %1;" +#else + "st.release.gpu.u64 [%0], %1;" +#endif + ::"l"(ptr), + "l"(result) + : "memory"); +} + + __device__ __forceinline__ thrust::complex load_relaxed_shared( const thrust::complex* ptr) { @@ -1037,14 +2661,14 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.cta.b16 t, [%1];\n\t" + " ld.relaxed.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1057,13 +2681,13 @@ __device__ __forceinline__ __half load_relaxed_local(const __half* ptr) __device__ __forceinline__ void store_relaxed_local(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.cta.b16 [%0], t;\n\t" + " st.relaxed.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1075,14 +2699,14 @@ __device__ __forceinline__ __half load_acquire_local(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.cta.b16 t, [%1];\n\t" + " ld.acquire.cta.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1096,13 +2720,13 @@ __device__ __forceinline__ void store_release_local(__half* ptr, __half result) { membar_acq_rel_local(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.cta.b16 [%0], t;\n\t" + " st.release.cta.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1114,14 +2738,14 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.relaxed.gpu.b16 t, [%1];\n\t" + " ld.relaxed.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1134,13 +2758,13 @@ __device__ __forceinline__ __half load_relaxed(const __half* ptr) __device__ __forceinline__ void store_relaxed(__half* ptr, __half result) { asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.relaxed.gpu.b16 
[%0], t;\n\t" + " st.relaxed.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1152,14 +2776,14 @@ __device__ __forceinline__ __half load_acquire(const __half* ptr) { float result; asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" + "{" + " .reg .f16 t;" #if __CUDA_ARCH__ < 700 - " ld.volatile.b16 t, [%1];\n\t" + " ld.volatile.b16 t, [%1];" #else - " ld.acquire.gpu.b16 t, [%1];\n\t" + " ld.acquire.gpu.b16 t, [%1];" #endif - " cvt.f32.f16 %0, t;\n\t" + " cvt.f32.f16 %0, t;" "}" : "=f"(result) : "l"(const_cast<__half*>(ptr)) @@ -1173,13 +2797,13 @@ __device__ __forceinline__ void store_release(__half* ptr, __half result) { membar_acq_rel(); asm volatile( - "{\n\t" - " .reg .f16 t;\n\t" - " cvt.rn.f16.f32 t, %1;\n\t" + "{" + " .reg .f16 t;" + " cvt.rn.f16.f32 t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile.b16 [%0], t;\n\t" + " st.volatile.b16 [%0], t;" #else - " st.release.gpu.b16 [%0], t;\n\t" + " st.release.gpu.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(static_cast(result)) @@ -1193,15 +2817,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed_local( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.cta.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -1216,14 +2840,14 @@ __device__ __forceinline__ void store_relaxed_local( auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.cta.v2.b16 [%0], t;\n\t" + "st.relaxed.cta.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) @@ -1237,15 +2861,15 @@ __device__ __forceinline__ thrust::complex<__half> load_relaxed( float real_result; float imag_result; asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" + "{" + " .reg .v2 .f16 t;" #if __CUDA_ARCH__ < 700 - "ld.volatile.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.volatile.v2.b16 {t.x, t.y}, [%2];" #else - "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];\n\t" + "ld.relaxed.gpu.v2.b16 {t.x, t.y}, [%2];" #endif - " cvt.f32.f16 %0, t.x;\n\t" - " cvt.f32.f16 %1, t.y;\n\t" + " cvt.f32.f16 %0, t.x;" + " cvt.f32.f16 %1, t.y;" "}" : "=f"(real_result), "=f"(imag_result) : "l"(const_cast*>(ptr)) @@ -1260,14 +2884,14 @@ __device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, auto real_result = static_cast(result.real()); auto imag_result = static_cast(result.imag()); asm volatile( - "{\n\t" - " .reg .v2 .f16 t;\n\t" - " cvt.rn.f16.f32 t.x, %1;\n\t" - " cvt.rn.f16.f32 t.y, %2;\n\t" + "{" + " .reg .v2 .f16 t;" + " cvt.rn.f16.f32 t.x, %1;" + " cvt.rn.f16.f32 t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile.v2.b16 [%0], t;\n\t" + "st.volatile.v2.b16 [%0], t;" #else - "st.relaxed.gpu.v2.b16 [%0], t;\n\t" + "st.relaxed.gpu.v2.b16 [%0], t;" #endif "}" ::"l"(ptr), "f"(real_result), "f"(imag_result) diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp 
index 7ff1382d8c6..ce4dc823b98 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.cpp +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #include "core/factorization/cholesky_kernels.hpp" #include +#include #include #include @@ -21,6 +22,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/reduction.hpp" #include "common/cuda_hip/components/syncfree.hpp" #include "common/cuda_hip/components/thread_ids.hpp" @@ -45,9 +47,6 @@ namespace cholesky { constexpr int default_block_size = 512; -#include "core/factorization/elimination_forest.hpp" - - namespace kernel { @@ -266,65 +265,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void build_children_from_parents( - std::shared_ptr exec, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = forest.parents.get_size(); - // build COO representation of the tree - array col_idx_array{exec, num_rows}; - const auto col_idxs = col_idx_array.get_data(); - const auto parents = forest.parents.get_const_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - exec->copy(num_rows, parents, col_idxs); - thrust::sequence(thrust_policy(exec), children, children + num_rows, - IndexType{}); - // group by parent - thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, - col_idxs + num_rows, children); - // create child pointers for groups of children - components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, - num_rows + 1, // rows plus sentinel - child_ptrs); -} - - -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto num_rows = factors->get_size()[0]; - const auto it = thrust::make_counting_iterator(IndexType{}); - thrust::transform( - thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), - [row_ptrs = factors->get_const_row_ptrs(), - col_idxs = factors->get_const_col_idxs(), - num_rows] __device__(IndexType l_col) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - return l_row; - } - } - // sentinel pseudo-root - return static_cast(num_rows); - }); - build_children_from_parents(exec, forest); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/common/cuda_hip/factorization/elimination_forest_kernels.cpp b/common/cuda_hip/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..52002e9a211 --- /dev/null +++ b/common/cuda_hip/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,371 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include 
+#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace elimination_forest { + + +constexpr int default_block_size = 512; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void mst_initialize_worklist( + const IndexType* __restrict__ rows, const IndexType* __restrict__ cols, + IndexType size, IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto row = rows[i]; + const auto col = cols[i]; + if (col < row) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = row; + worklist_targets[out_i] = col; + worklist_edge_ids[out_i] = i; + } +} + + +template +__device__ IndexType mst_find(const IndexType* parents, IndexType node) +{ + auto parent = parents[node]; + while (parent != node) { + node = parent; + parent = parents[node]; + }; + return parent; +} + + +template +__device__ IndexType mst_find_relaxed(const IndexType* parents, IndexType node) +{ + auto parent = load_relaxed_local(parents + node); + while (parent != node) { + node = parent; + parent = load_relaxed_local(parents + node); + }; + return parent; +} + + +template +__device__ void guarded_atomic_min(IndexType* ptr, IndexType value) +{ + // only execute the atomic if we know that it might have an effect + if (load_relaxed_local(ptr) > value) { + atomic_min_relaxed(ptr, value); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_find_minimum( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + const IndexType* __restrict__ parents, IndexType* __restrict__ min_edge, + IndexType* __restrict__ worklist_sources, + IndexType* __restrict__ worklist_targets, + IndexType* __restrict__ worklist_edge_ids, + IndexType* __restrict__ worklist_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + const auto source_rep = mst_find(parents, source); + const auto target_rep = mst_find(parents, target); + if (source_rep != target_rep) { + const auto out_i = atomic_add_relaxed(worklist_counter, 1); + worklist_sources[out_i] = source_rep; + worklist_targets[out_i] = target_rep; + worklist_edge_ids[out_i] = edge_id; + guarded_atomic_min(min_edge + source_rep, edge_id); + guarded_atomic_min(min_edge + target_rep, edge_id); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_join_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, + const IndexType* __restrict__ in_edge_ids, IndexType size, + IndexType* __restrict__ parents, const 
IndexType* __restrict__ min_edge, + const IndexType* __restrict__ edge_sources, + const IndexType* __restrict__ edge_targets, + IndexType* __restrict__ out_sources, IndexType* __restrict__ out_targets, + IndexType* __restrict__ out_counter) +{ + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + const auto edge_id = in_edge_ids[i]; + if (min_edge[source] == edge_id || min_edge[target] == edge_id) { + // join source and sink + const auto source_rep = mst_find_relaxed(parents, source); + const auto target_rep = mst_find_relaxed(parents, target); + assert(source_rep != target_rep); + const auto new_rep = min(source_rep, target_rep); + auto old_rep = max(source_rep, target_rep); + bool repeat = false; + do { + repeat = false; + auto old_parent = + atomic_cas_relaxed(parents + old_rep, old_rep, new_rep); + // if this fails, the parent of old_rep changed recently, so we need + // to try again by updating the parent's parent (hopefully its rep) + if (old_parent != old_rep) { + old_rep = old_parent; + repeat = true; + } + } while (repeat); + const auto out_i = atomic_add_relaxed(out_counter, 1); + out_sources[out_i] = edge_sources[edge_id]; + out_targets[out_i] = edge_targets[edge_id]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void mst_reset_min_edges( + const IndexType* __restrict__ in_sources, + const IndexType* __restrict__ in_targets, IndexType size, + IndexType* __restrict__ min_edge) +{ + constexpr auto sentinel = std::numeric_limits::max(); + const auto i = thread::get_thread_id_flat(); + if (i >= size) { + return; + } + const auto source = in_sources[i]; + const auto target = in_targets[i]; + // we could write the values non-atomically, but this makes race checkers + // happier without a performance penalty (hopefully, thanks to _local) + store_relaxed_local(min_edge + source, sentinel); + store_relaxed_local(min_edge + target, sentinel); +} + + +} // namespace kernel + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + // This is a minimum spanning tree algorithm implementation based on + // A. Fallin, A. Gonzalez, J. Seo, and M. Burtscher, + // "A High-Performance MST Implementation for GPUs,” + // doi: 10.1145/3581784.3607093 + // we don't filter heavy edges since the heaviest edges are necessary to + // reach the last node and we don't need to sort since the COO format + // already sorts by row index. 
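+    // Outline of the rounds below (Boruvka-style contraction):
+    // 1. fill a worklist with every strictly lower-triangular edge
+    // 2. for each worklist edge whose endpoints lie in different
+    //    components, record the minimum incident edge id per component
+    //    representative (mst_find_minimum)
+    // 3. contract the components along those minimum edges and append
+    //    them to the output edge list (mst_join_edges)
+    // 4. reset the recorded minima and repeat until no remaining edge
+    //    connects two different components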
+ const auto policy = thrust_policy(exec); + const auto nnz = exec->copy_val_to_host(row_ptrs + size); + // convert edges to COO representation + // the edge list is sorted, since we only consider edges where row > col, + // and the row array (= weights) is sorted coming from row_ptrs + array row_array{exec, static_cast(nnz)}; + const auto rows = row_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, size, rows); + // we assume the matrix is symmetric, so we can remove every second edge + // also round up the worklist size for equal cache alignment between fields + const auto worklist_size = + ceildiv(nnz, config::warp_size * 2) * config::warp_size; + // create 2 worklists consisting of (start, end, edge_id) + array worklist{exec, static_cast(worklist_size * 6)}; + auto wl1_source = worklist.get_data(); + auto wl1_target = wl1_source + worklist_size; + auto wl1_edge_id = wl1_target + worklist_size; + auto wl2_source = wl1_source + 3 * worklist_size; + auto wl2_target = wl1_target + 3 * worklist_size; + auto wl2_edge_id = wl1_edge_id + 3 * worklist_size; + // atomic counters for worklists and output edge list + array counters{exec, 3}; + auto wl1_counter = counters.get_data(); + auto wl2_counter = wl1_counter + 1; + auto output_counter = wl2_counter + 1; + components::fill_array(exec, counters.get_data(), counters.get_size(), + IndexType{}); + // helpers for interacting with worklists + const auto clear_wl1 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl1_counter); + }; + const auto clear_wl2 = [&] { + IndexType value{}; + exec->copy_from(exec->get_master(), 1, &value, wl2_counter); + }; + const auto get_wl1_size = [&] { + return exec->copy_val_to_host(wl1_counter); + }; + const auto swap_wl1_wl2 = [&] { + std::swap(wl1_source, wl2_source); + std::swap(wl1_target, wl2_target); + std::swap(wl1_edge_id, wl2_edge_id); + std::swap(wl1_counter, wl2_counter); + }; + // initialize every node to a singleton set + array parent_array{exec, size}; + const auto parents = parent_array.get_data(); + components::fill_seq_array(exec, parents, size); + // array storing the minimal edge adjacent to each node + array min_edge_array{exec, size}; + const auto min_edges = min_edge_array.get_data(); + constexpr auto min_edge_sentinel = std::numeric_limits::max(); + components::fill_array(exec, min_edges, size, min_edge_sentinel); + // output row array, to be used in conjunction with out_cols in COO storage + array out_row_array{exec, size}; + const auto out_rows = out_row_array.get_data(); + // initialize worklist1 with forward edges + { + const auto num_blocks = ceildiv(nnz, default_block_size); + kernel::mst_initialize_worklist<<>>( + rows, cols, nnz, wl1_source, wl1_target, wl1_edge_id, wl1_counter); + } + auto wl1_size = get_wl1_size(); + while (wl1_size > 0) { + clear_wl2(); + // attach each node to its smallest adjacent non-cycle edge + { + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_find_minimum<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, wl2_source, wl2_target, wl2_edge_id, wl2_counter); + } + clear_wl1(); + swap_wl1_wl2(); + wl1_size = get_wl1_size(); + if (wl1_size > 0) { + // join minimal edges + const auto num_blocks = ceildiv(wl1_size, default_block_size); + kernel::mst_join_edges<<>>( + wl1_source, wl1_target, wl1_edge_id, wl1_size, parents, + min_edges, rows, cols, out_rows, out_cols, output_counter); + kernel::mst_reset_min_edges<<>>( + wl1_source, wl1_target, wl1_size, min_edges); + 
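+            // the minima recorded for the representatives touched in this
+            // round are now back at the sentinel, so the next
+            // mst_find_minimum pass starts from a clean slate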
} + } + const auto num_mst_edges = exec->copy_val_to_host(output_counter); + // two separate sort calls get turned into efficient RadixSort invocations + thrust::sort_by_key(policy, out_cols, out_cols + num_mst_edges, out_rows); + thrust::stable_sort_by_key(policy, out_rows, out_rows + num_mst_edges, + out_cols); + components::convert_idxs_to_ptrs(exec, out_rows, num_mst_edges, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void build_children_from_parents( + std::shared_ptr exec, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = forest.parents.get_size(); + // build COO representation of the tree + array col_idx_array{exec, num_rows}; + const auto col_idxs = col_idx_array.get_data(); + const auto parents = forest.parents.get_const_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + exec->copy(num_rows, parents, col_idxs); + thrust::sequence(thrust_policy(exec), children, children + num_rows, + IndexType{}); + // group by parent + thrust::stable_sort_by_key(thrust_policy(exec), col_idxs, + col_idxs + num_rows, children); + // create child pointers for groups of children + components::convert_idxs_to_ptrs(exec, col_idxs, num_rows, + num_rows + 1, // rows plus sentinel + child_ptrs); +} + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto num_rows = factors->get_size()[0]; + const auto it = thrust::make_counting_iterator(IndexType{}); + thrust::transform( + thrust_policy(exec), it, it + num_rows, forest.parents.get_data(), + [row_ptrs = factors->get_const_row_ptrs(), + col_idxs = factors->get_const_col_idxs(), + num_rows] __device__(IndexType l_col) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + return l_row; + } + } + // sentinel pseudo-root + return static_cast(num_rows); + }); + build_children_from_parents(exec, forest); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index bd2423d4306..8b3167eca97 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -35,6 +35,7 @@ #include "common/cuda_hip/components/cooperative_groups.hpp" #include "common/cuda_hip/components/format_conversion.hpp" #include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/reduction.hpp" @@ -1293,7 +1294,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( } #else if (i < row_len) { - while 
(atomicCAS(local_storage + hash, empty, i) != empty) { + while (atomic_cas_relaxed_local(local_storage + hash, empty, + static_cast(i)) != empty) { hash++; if (hash >= available_storage) { hash = 0; diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 2bb18cbdd22..96370647721 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -153,22 +153,6 @@ __device__ __forceinline__ IndexType disjoint_set_find(IndexType node, } -template -struct atomic_map {}; - - -template <> -struct atomic_map { - using type = int; -}; - - -template <> -struct atomic_map { - using type = unsigned long long; -}; - - template __global__ __launch_bounds__(default_block_size) void connected_components_combine( @@ -176,7 +160,6 @@ __launch_bounds__(default_block_size) void connected_components_combine( const IndexType* __restrict__ col_idxs, IndexType num_rows, IndexType* __restrict__ parents) { - using atomic_type = typename atomic_map::type; const auto row = thread::get_thread_id_flat(); if (row >= num_rows) { return; @@ -196,10 +179,8 @@ __launch_bounds__(default_block_size) void connected_components_combine( auto& max_parent = col_parent < parent ? parent : col_parent; // attempt to attach the (assumed unattached) larger node to the // smaller node - const auto old_parent = static_cast(atomicCAS( - reinterpret_cast(parents + max_parent), - static_cast(max_parent), - static_cast(min_parent))); + const auto old_parent = atomic_cas_relaxed( + parents + max_parent, max_parent, min_parent); // if unsuccessful, proceed with the parent of the (now known // attached) node if (old_parent != max_parent) { @@ -354,7 +335,6 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( IndexType level, IndexType* __restrict__ node_levels, IndexType* __restrict__ level_nodes, IndexType* __restrict__ output_ptr) { - using atomic_type = typename atomic_map::type; const auto source = thread::get_thread_id_flat(); if (source >= num_sources) { return; @@ -362,17 +342,13 @@ __global__ __launch_bounds__(default_block_size) void ubfs_level_kernel( const auto row = sources[source]; const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; - atomic_type unsigned_unattached{}; const auto unattached = invalid_index(); - memcpy(&unsigned_unattached, &unattached, sizeof(IndexType)); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; if (node_levels[col] == unattached && - atomicCAS(reinterpret_cast(node_levels + col), - unsigned_unattached, - static_cast(level)) == unsigned_unattached) { - const auto output_pos = - atomicAdd(reinterpret_cast(output_ptr), 1); + atomic_cas_relaxed(node_levels + col, unattached, level) == + unattached) { + const auto output_pos = atomic_add_relaxed(output_ptr, 1); level_nodes[output_pos] = col; } } diff --git a/core/components/disjoint_sets.hpp b/core/components/disjoint_sets.hpp index 601817e69d3..77a9177d172 100644 --- a/core/components/disjoint_sets.hpp +++ b/core/components/disjoint_sets.hpp @@ -1,12 +1,11 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause #ifndef GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ #define GKO_CORE_COMPONENTS_DISJOINT_SETS_HPP_ - -#include +#include 
"core/base/allocator.hpp" namespace gko { @@ -31,10 +30,8 @@ class disjoint_sets { * allocate storage. */ explicit disjoint_sets(std::shared_ptr exec, IndexType size) - : parents_{exec->get_master(), static_cast(size)} - { - parents_.fill(-1); - } + : parents_{static_cast(size), -1, exec->get_master()} + {} /** * Returns true if and only if the element x is the representative of its @@ -138,17 +135,14 @@ class disjoint_sets { * * @return the number of elements represented in this data structure. */ - IndexType get_size() const { return parents_.get_size(); } + IndexType get_size() const { return parents_.size(); } private: - const IndexType& parent(IndexType x) const - { - return parents_.get_const_data()[x]; - } + const IndexType& parent(IndexType x) const { return parents_[x]; } - IndexType& parent(IndexType x) { return parents_.get_data()[x]; } + IndexType& parent(IndexType x) { return parents_[x]; } - array parents_; + vector parents_; }; diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 034bc73e388..437e1145c40 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -23,6 +23,7 @@ #include "core/distributed/partition_kernels.hpp" #include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/factorization/ilu_kernels.hpp" @@ -929,7 +930,6 @@ namespace cholesky { GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); @@ -937,6 +937,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); } // namespace cholesky +namespace elimination_forest { + + +GKO_STUB_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + +} // namespace elimination_forest + + namespace factorization { diff --git a/core/factorization/cholesky.cpp b/core/factorization/cholesky.cpp index ce189e3bbdc..2f8d8630a1e 100644 --- a/core/factorization/cholesky.cpp +++ b/core/factorization/cholesky.cpp @@ -15,6 +15,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -27,7 +28,7 @@ namespace { GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -102,7 +103,7 @@ std::unique_ptr Cholesky::generate_impl( forest = std::make_unique>( exec, num_rows); - exec->run(make_forest_from_factor(factors.get(), *forest)); + exec->run(make_from_factor(factors.get(), *forest)); } // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git 
a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 8230fa4552d..fa87edd7fe3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -37,13 +37,6 @@ namespace kernels { const array& tmp_storage) -#define GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType) \ - void forest_from_factor( \ - std::shared_ptr exec, \ - const matrix::Csr* factors, \ - gko::factorization::elimination_forest& forest) - - #define GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType) \ void initialize(std::shared_ptr exec, \ const matrix::Csr* mtx, \ @@ -71,8 +64,6 @@ namespace kernels { template \ GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE(ValueType, IndexType); \ template \ - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR(ValueType, IndexType); \ - template \ GKO_DECLARE_CHOLESKY_INITIALIZE(ValueType, IndexType); \ template \ GKO_DECLARE_CHOLESKY_FACTORIZE(ValueType, IndexType) diff --git a/core/factorization/elimination_forest.cpp b/core/factorization/elimination_forest.cpp index 1dc8ff060a0..89bd9505f6d 100644 --- a/core/factorization/elimination_forest.cpp +++ b/core/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -13,10 +13,9 @@ namespace { template -void compute_elim_forest_parent_impl(std::shared_ptr host_exec, - const IndexType* row_ptrs, - const IndexType* cols, IndexType num_rows, - IndexType* parent) +void compute_elimination_forest_parent_impl( + std::shared_ptr host_exec, const IndexType* row_ptrs, + const IndexType* cols, IndexType num_rows, IndexType* parent) { disjoint_sets subtrees{host_exec, num_rows}; array subtree_root_array{host_exec, @@ -50,8 +49,10 @@ void compute_elim_forest_parent_impl(std::shared_ptr host_exec, template -void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, - IndexType* child_ptr, IndexType* child) +void compute_elimination_forest_children_impl(const IndexType* parent, + IndexType size, + IndexType* child_ptr, + IndexType* child) { // count how many times each parent occurs, excluding pseudo-root at // parent == size @@ -74,7 +75,7 @@ void compute_elim_forest_children_impl(const IndexType* parent, IndexType size, template -void compute_elim_forest_postorder_impl( +void compute_elimination_forest_postorder_impl( std::shared_ptr host_exec, const IndexType* parent, const IndexType* child_ptr, const IndexType* child, IndexType size, IndexType* postorder, IndexType* inv_postorder) @@ -111,10 +112,9 @@ void compute_elim_forest_postorder_impl( template -void compute_elim_forest_postorder_parent_impl(const IndexType* parent, - const IndexType* inv_postorder, - IndexType size, - IndexType* postorder_parent) +void compute_elimination_forest_postorder_parent_impl( + const IndexType* parent, const IndexType* inv_postorder, IndexType size, + IndexType* postorder_parent) { for (IndexType row = 0; row < size; row++) { postorder_parent[inv_postorder[row]] = @@ -140,26 +140,27 @@ void elimination_forest::set_executor( template -void compute_elim_forest(const matrix::Csr* mtx, - std::unique_ptr>& forest) +void compute_elimination_forest( + const matrix::Csr* mtx, + std::unique_ptr>& forest) { const auto host_exec = 
mtx->get_executor()->get_master(); const auto host_mtx = make_temporary_clone(host_exec, mtx); const auto num_rows = static_cast(host_mtx->get_size()[0]); forest = std::make_unique>(host_exec, num_rows); - compute_elim_forest_parent_impl(host_exec, host_mtx->get_const_row_ptrs(), - host_mtx->get_const_col_idxs(), num_rows, - forest->parents.get_data()); - compute_elim_forest_children_impl(forest->parents.get_const_data(), - num_rows, forest->child_ptrs.get_data(), - forest->children.get_data()); - compute_elim_forest_postorder_impl( + compute_elimination_forest_parent_impl( + host_exec, host_mtx->get_const_row_ptrs(), + host_mtx->get_const_col_idxs(), num_rows, forest->parents.get_data()); + compute_elimination_forest_children_impl( + forest->parents.get_const_data(), num_rows, + forest->child_ptrs.get_data(), forest->children.get_data()); + compute_elimination_forest_postorder_impl( host_exec, forest->parents.get_const_data(), forest->child_ptrs.get_const_data(), forest->children.get_const_data(), num_rows, forest->postorder.get_data(), forest->inv_postorder.get_data()); - compute_elim_forest_postorder_parent_impl( + compute_elimination_forest_postorder_parent_impl( forest->parents.get_const_data(), forest->inv_postorder.get_const_data(), num_rows, forest->postorder_parents.get_data()); @@ -168,12 +169,13 @@ void compute_elim_forest(const matrix::Csr* mtx, } -#define GKO_DECLARE_COMPUTE_ELIM_FOREST(ValueType, IndexType) \ - void compute_elim_forest( \ - const matrix::Csr* mtx, \ +#define GKO_DECLARE_COMPUTE_ELIMINATION_FOREST(ValueType, IndexType) \ + void compute_elimination_forest( \ + const matrix::Csr* mtx, \ std::unique_ptr>& forest) -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_COMPUTE_ELIM_FOREST); +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_COMPUTE_ELIMINATION_FOREST); } // namespace factorization diff --git a/core/factorization/elimination_forest.hpp b/core/factorization/elimination_forest.hpp index 5307a90384c..50864ec6597 100644 --- a/core/factorization/elimination_forest.hpp +++ b/core/factorization/elimination_forest.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -41,7 +41,7 @@ struct elimination_forest { template -void compute_elim_forest( +void compute_elimination_forest( const matrix::Csr* mtx, std::unique_ptr>& forest); diff --git a/core/factorization/elimination_forest_kernels.hpp b/core/factorization/elimination_forest_kernels.hpp new file mode 100644 index 00000000000..39435846119 --- /dev/null +++ b/core/factorization/elimination_forest_kernels.hpp @@ -0,0 +1,56 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ +#define GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ + + +#include "core/factorization/elimination_forest.hpp" + +#include + +#include +#include +#include + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType) \ + void compute_skeleton_tree(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* cols, size_type size, \ + IndexType* out_row_ptrs, IndexType* out_cols) + + +#define GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) \ + void from_factor( \ + std::shared_ptr exec, \ + const matrix::Csr* factors, \ + 
gko::factorization::elimination_forest& forest) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE(IndexType); \ + template \ + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR(ValueType, IndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(elimination_forest, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_FACTORIZATION_ELIMINATION_FOREST_KERNELS_HPP_ diff --git a/core/factorization/ic.cpp b/core/factorization/ic.cpp index 46d45ba5f04..3eb14e05a4a 100644 --- a/core/factorization/ic.cpp +++ b/core/factorization/ic.cpp @@ -18,6 +18,7 @@ #include "core/config/config_helper.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -37,7 +38,7 @@ GKO_REGISTER_OPERATION(initialize_row_ptrs_l, GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l); // for gko syncfree implementation GKO_REGISTER_OPERATION(fill_array, components::fill_array); -GKO_REGISTER_OPERATION(forest_from_factor, cholesky::forest_from_factor); +GKO_REGISTER_OPERATION(from_factor, elimination_forest::from_factor); GKO_REGISTER_OPERATION(initialize, cholesky::initialize); GKO_REGISTER_OPERATION(factorize, cholesky::factorize); @@ -119,8 +120,7 @@ std::unique_ptr> Ic::generate( forest = std::make_unique>( exec, num_rows); - exec->run( - ic_factorization::make_forest_from_factor(factors.get(), *forest)); + exec->run(ic_factorization::make_from_factor(factors.get(), *forest)); // setup lookup structure on factors const auto lookup = matrix::csr::build_lookup(factors.get()); diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp index b08602e1a1a..92d0bf663cd 100644 --- a/core/factorization/symbolic.cpp +++ b/core/factorization/symbolic.cpp @@ -15,6 +15,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -24,6 +25,8 @@ namespace factorization { namespace { +GKO_REGISTER_OPERATION(compute_skeleton_tree, + elimination_forest::compute_skeleton_tree); GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count); GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize); GKO_REGISTER_OPERATION(prefix_sum_nonnegative, @@ -32,7 +35,8 @@ GKO_REGISTER_OPERATION(symbolic_factorize_simple, lu_factorization::symbolic_factorize_simple); GKO_REGISTER_OPERATION(symbolic_factorize_simple_finalize, lu_factorization::symbolic_factorize_simple_finalize); -GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest); +GKO_REGISTER_HOST_OPERATION(compute_elimination_forest, + compute_elimination_forest); } // namespace @@ -47,8 +51,7 @@ void symbolic_cholesky( using matrix_type = matrix::Csr; GKO_ASSERT_IS_SQUARE_MATRIX(mtx); const auto exec = mtx->get_executor(); - const auto host_exec = exec->get_master(); - exec->run(make_compute_elim_forest(mtx, forest)); + exec->run(make_compute_elimination_forest(mtx, forest)); const auto num_rows = mtx->get_size()[0]; array row_ptrs{exec, num_rows + 1}; array tmp{exec}; @@ -80,6 +83,55 @@ void 
symbolic_cholesky( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY); +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest) +{ + using matrix_type = matrix::Csr; + GKO_ASSERT_IS_SQUARE_MATRIX(mtx); + const auto exec = mtx->get_executor(); + const auto num_rows = mtx->get_size()[0]; + { + const auto skeleton = + matrix_type::create(exec, mtx->get_size(), num_rows); + exec->run(make_compute_skeleton_tree( + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows, + skeleton->get_row_ptrs(), skeleton->get_col_idxs())); + exec->run(make_compute_elimination_forest(skeleton.get(), forest)); + } + array row_ptrs{exec, num_rows + 1}; + array tmp{exec}; + exec->run(make_symbolic_count(mtx, *forest, row_ptrs.get_data(), tmp)); + exec->run(make_prefix_sum_nonnegative(row_ptrs.get_data(), num_rows + 1)); + const auto factor_nnz = + static_cast(get_element(row_ptrs, num_rows)); + factors = matrix_type::create( + exec, mtx->get_size(), array{exec, factor_nnz}, + array{exec, factor_nnz}, std::move(row_ptrs)); + exec->run(make_symbolic(mtx, *forest, factors.get(), tmp)); + factors->sort_by_column_index(); + if (symmetrize) { + auto lt_factor = as(factors->transpose()); + const auto scalar = + initialize>({one()}, exec); + const auto id = matrix::Identity::create(exec, num_rows); + lt_factor->apply(scalar, id, scalar, factors); + } +} + + +#define GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE(ValueType, IndexType) \ + void symbolic_cholesky_device( \ + const matrix::Csr* mtx, bool symmetrize, \ + std::unique_ptr>& factors, \ + std::unique_ptr>& forest) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SYMBOLIC_CHOLESKY_DEVICE); + + template void symbolic_lu_near_symm( const matrix::Csr* mtx, diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp index 096d8c998bc..f3a6624a762 100644 --- a/core/factorization/symbolic.hpp +++ b/core/factorization/symbolic.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -25,6 +25,22 @@ void symbolic_cholesky( std::unique_ptr>& factors, std::unique_ptr>& forest); + +/** + * Computes the symbolic Cholesky factorization of the given matrix on the + * device. + * + * @param mtx the input matrix + * @param symmetrize output the pattern of L + L^T (true) or just L (false)? + * @param factors the output factor(s) + * @param forest the elimination forest of the input matrix + */ +template +void symbolic_cholesky_device( + const matrix::Csr* mtx, bool symmetrize, + std::unique_ptr>& factors, + std::unique_ptr>& forest); + /** * Computes the symbolic LU factorization of the given matrix. 
* diff --git a/core/test/factorization/elimination_forest.cpp b/core/test/factorization/elimination_forest.cpp index 292b366f50e..9bbc9536c93 100644 --- a/core/test/factorization/elimination_forest.cpp +++ b/core/test/factorization/elimination_forest.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -55,7 +55,7 @@ TYPED_TEST(EliminationForest, WorksForExample) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 8, 6, 7, 8, 9, 10})); @@ -92,7 +92,7 @@ TYPED_TEST(EliminationForest, WorksForSeparable) this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 2, 10, 4, 5, 9, 7, 9, 9, 10})); @@ -128,7 +128,7 @@ TYPED_TEST(EliminationForest, WorksForPostOrderNotSelfInverse) }, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ(forest->parents, I({2, 4, 6, 8, 5, 6, 7, 8, 9, 10})); GKO_ASSERT_ARRAY_EQ(forest->child_ptrs, @@ -152,7 +152,7 @@ TYPED_TEST(EliminationForest, WorksForAni1) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); // the elimination tree is a path gko::array iota_arr{this->ref, 36}; @@ -178,7 +178,7 @@ TYPED_TEST(EliminationForest, WorksForAni1Amd) auto mtx = gko::read(stream, this->ref); std::unique_ptr> forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); GKO_ASSERT_ARRAY_EQ( forest->parents, diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 3dae62151b3..6b1014cdba0 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -108,6 +108,20 @@ class biggest_valuetype< }; +template +auto get_next_value_optional(NonzeroIterator& it, const NonzeroIterator& end, + size_type next_row, size_type next_col) + -> std::optionalvalue)>::type> +{ + if (it != end && it->row == next_row && it->column == next_col) { + return std::optional((it++)->value); + } else { + // monostate + return {}; + } +} + + template auto get_next_value(NonzeroIterator& it, const NonzeroIterator& end, size_type next_row, size_type next_col) -> @@ -190,11 +204,12 @@ void print_sparsity_pattern(Ostream& os, const MatrixData1& first, os << (row % 10); for (size_type col = 0; col < first.size[1]; col++) { const auto has_first = - get_next_value(first_it, end(first.nonzeros), row, col) != - zero(); + get_next_value_optional(first_it, end(first.nonzeros), row, col) + .has_value(); const auto has_second = - get_next_value(second_it, end(second.nonzeros), row, col) != - zero(); + get_next_value_optional(second_it, end(second.nonzeros), row, + col) + .has_value(); if (has_first) { if (has_second) { os << '+'; diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py 
b/dev_tools/scripts/generate_cuda_memory_ptx.py index 834c49dba46..089cbf5c2dd 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -17,6 +17,8 @@ class ordering: fn_load_suffix: str ptx_store_suffix: str fn_store_suffix: str + ptx_loadstore_suffix: str + fn_loadstore_suffix: str is_relaxed: bool @@ -27,6 +29,13 @@ class type_desc: name: str +@dataclasses.dataclass +class operation: + fn_op_suffix: str + ptx_op_suffix: str + supports_float: bool + supports_signed: bool + memory_spaces = [ space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared", ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"), @@ -35,16 +44,25 @@ class type_desc: space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")] memory_orderings = [ ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True), + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True), ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire", - ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False) + ptx_store_suffix=".release", fn_store_suffix="_release", + ptx_loadstore_suffix=".acq_rel", fn_loadstore_suffix="_acqrel", is_relaxed=False) ] types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"), type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"), type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"), - type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")] + type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double"), + type_desc(ptx_type_suffix=".u32", val_constraint="r", name="uint32"), + type_desc(ptx_type_suffix=".u64", val_constraint="l", name="uint64")] +operations = [operation(fn_op_suffix="_add", ptx_op_suffix=".add", supports_float=True, supports_signed=True), + operation(fn_op_suffix="_min", ptx_op_suffix=".min", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_max", ptx_op_suffix=".max", supports_float=False, supports_signed=True), + operation(fn_op_suffix="_and", ptx_op_suffix=".and", supports_float=False, supports_signed=False), + operation(fn_op_suffix="_or", ptx_op_suffix=".or", supports_float=False, supports_signed=False)] # header -print("""// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +print("""// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -149,6 +167,52 @@ class type_desc: :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result) : "memory"); }} +""") + for op in operations: + if (not t.ptx_type_suffix.startswith(".f") or op.supports_float) and (not t.ptx_type_suffix.startswith(".s") or op.supports_signed): + # for some reason there is no signed 64 bit atomic support, + # but since the operations are equivalent in two's complement, we can use unsigned instead + new_type = ".u64" if t.ptx_type_suffix == ".s64" and op.fn_op_suffix == "_add" else t.ptx_type_suffix + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed: + print(f""" +__device__ __forceinline__ {t.name} atomic{op.fn_op_suffix}{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} value) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" 
+#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{op.ptx_op_suffix}{new_type} %0, [%1], %2;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(value) + : "memory"); + return result; +}} +""") + # non-relaxed atomics are unsupported with SM 6.0 and below + if o.is_relaxed and not t.ptx_type_suffix.startswith(".f"): + new_type = ".b" + t.ptx_type_suffix[2:] + print(f""" +__device__ __forceinline__ {t.name} atomic_cas{o.fn_loadstore_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} old_val, {t.name} new_val) +{{ + {t.name} result; + asm volatile( +#if __CUDA_ARCH__ < 600 + "atom{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#elif __CUDA_ARCH__ < 700 + "atom{s.ptx_space_suffix}{s.ptx_scope_suffix}.cas{new_type} %0, [%1], %2, %3;" +#else + "atom{o.ptx_loadstore_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}.cas{new_type} %0, [%1], %2, %3;" +#endif + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(old_val), "{t.val_constraint}"(new_val) + : "memory"); + return result; +}} """) # vectorized relaxed loads for thrust::complex @@ -209,14 +273,14 @@ class type_desc: __device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) {{ {t.parent_name} result; - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #else - " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;" "}}" : "={t.val_constraint}"(result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -229,13 +293,13 @@ class type_desc: __device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) {{ {membar_expression} - asm volatile("{{\\n\\t" - " .reg {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + asm volatile("{{" + " .reg {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;" #if __CUDA_ARCH__ < 700 - " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #else - " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result)) @@ -245,7 +309,8 @@ class type_desc: for s in memory_spaces_without_shared: o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", - ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", + ptx_loadstore_suffix=".relaxed", fn_loadstore_suffix="_relaxed", is_relaxed=True) const_ptr_expr = s.ptr_expr.format( ptr=f"const_cast*>(ptr)") mut_ptr_expr = s.ptr_expr.format(ptr="ptr") @@ -254,15 +319,15 @@ 
class type_desc: {{ {t.parent_name} real_result; {t.parent_name} imag_result; - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" #if __CUDA_ARCH__ < 700 - "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #else - "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];\\n\\t" + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{t.x, t.y}}, [%2];" #endif - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" - " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;" "}}" : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) : "{s.ptr_constraint}"({const_ptr_expr}) @@ -275,14 +340,14 @@ class type_desc: {{ auto real_result = static_cast<{t.parent_name}>(result.real()); auto imag_result = static_cast<{t.parent_name}>(result.imag()); - asm volatile("{{\\n\\t" - " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" - " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" + asm volatile("{{" + " .reg .v2 {t.ptx_type_suffix} t;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;" #if __CUDA_ARCH__ < 700 - "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #else - "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;" #endif "}}" :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 81a2a6034ea..a5fc043e171 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(ginkgo_dpcpp distributed/partition_kernels.dp.cpp distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp + factorization/elimination_forest_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp factorization/factorization_kernels.dp.cpp diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index 5cd8396be17..76a2f9d2b6a 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -142,16 +142,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor(std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& - forest) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/dpcpp/factorization/elimination_forest_kernels.dp.cpp b/dpcpp/factorization/elimination_forest_kernels.dp.cpp new file mode 100644 index 00000000000..01310b5c448 
--- /dev/null +++ b/dpcpp/factorization/elimination_forest_kernels.dp.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include + +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index 8a98ee822b8..a05bf7de344 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -128,6 +128,51 @@ __device__ __forceinline__ void store_generic(ValueType* ptr, ValueType value) } +template +__device__ __forceinline__ ValueType atomic_add_relaxed(ValueType* ptr, + AddType value) +{ + return __atomic_fetch_add(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_min_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_min(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_max_relaxed(ValueType* ptr, + ValueType value) +{ + return __atomic_fetch_max(ptr, value, __ATOMIC_RELAXED); +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + __atomic_compare_exchange_n(ptr, &old_val, new_val, false, __ATOMIC_RELAXED, + __ATOMIC_RELAXED); + return old_val; +} + + +template +__device__ __forceinline__ ValueType atomic_cas_relaxed_local(ValueType* ptr, + ValueType old_val, + ValueType new_val) +{ + // no special optimization available for threadblock-local atomic CAS + return atomic_cas_relaxed(ptr, old_val, new_val); +} + + template __device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr) { diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 700a01116c9..31f369356f8 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -17,6 +17,7 @@ target_sources(ginkgo_omp distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/omp/factorization/cholesky_kernels.cpp b/omp/factorization/cholesky_kernels.cpp index aa4aabfc731..826e06026da 100644 --- a/omp/factorization/cholesky_kernels.cpp +++ b/omp/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -9,6 +9,8 @@ #include 
+#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" @@ -130,49 +132,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - const auto num_rows = static_cast(factors->get_size()[0]); - components::fill_array(exec, parents, num_rows, num_rows); -#pragma omp parallel for - for (IndexType l_col = 0; l_col < num_rows; l_col++) { - const auto llt_row_begin = row_ptrs[l_col]; - const auto llt_row_end = row_ptrs[l_col + 1]; - for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { - const auto l_row = col_idxs[nz]; - // parent[j] = min(i | i > j and l_ij =/= 0) - // we read from L^T stored above the diagonal in factors - // assuming a sorted order of the columns - if (l_row > l_col) { - parents[l_col] = l_row; - break; - } - } - } - // group by parent - array parents_copy{exec, static_cast(num_rows)}; - exec->copy(num_rows, parents, parents_copy.get_data()); - components::fill_seq_array(exec, children, num_rows); - const auto it = - detail::make_zip_iterator(parents_copy.get_data(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), - num_rows, num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..f3f180a515d --- /dev/null +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,123 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz / 2); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + if (col >= row) { + continue; + } + // edge contains (max, min) pair + edges.emplace_back(row, col); + } + } + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + 
[](auto a, auto b) { return a.first < b.first; })); + // output helper array: Store row indices for output rows + // since the input is sorted by edge.first == row, this will be sorted + vector out_rows(size, exec); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + const auto num_rows = static_cast(factors->get_size()[0]); + components::fill_array(exec, parents, num_rows, num_rows); +#pragma omp parallel for + for (IndexType l_col = 0; l_col < num_rows; l_col++) { + const auto llt_row_begin = row_ptrs[l_col]; + const auto llt_row_end = row_ptrs[l_col + 1]; + for (auto nz = llt_row_begin; nz < llt_row_end; nz++) { + const auto l_row = col_idxs[nz]; + // parent[j] = min(i | i > j and l_ij =/= 0) + // we read from L^T stored above the diagonal in factors + // assuming a sorted order of the columns + if (l_row > l_col) { + parents[l_col] = l_row; + break; + } + } + } + // group by parent + array parents_copy{exec, static_cast(num_rows)}; + exec->copy(num_rows, parents, parents_copy.get_data()); + components::fill_seq_array(exec, children, num_rows); + const auto it = + detail::make_zip_iterator(parents_copy.get_data(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.get_const_data(), + num_rows, num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 94e61d43d5c..27c04a4db09 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources(ginkgo_reference distributed/partition_kernels.cpp distributed/vector_kernels.cpp factorization/cholesky_kernels.cpp + factorization/elimination_forest_kernels.cpp factorization/factorization_kernels.cpp factorization/ic_kernels.cpp factorization/ilu_kernels.cpp diff --git a/reference/factorization/cholesky_kernels.cpp b/reference/factorization/cholesky_kernels.cpp index e4d7112a15f..80b6c53663b 100644 --- a/reference/factorization/cholesky_kernels.cpp +++ b/reference/factorization/cholesky_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -6,7 +6,6 @@ #include 
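Both the omp kernel above and the reference kernel below build the skeleton tree with the same Kruskal-style sweep: visit the strictly lower-triangular edges in ascending row order and keep an edge only if it joins two components that are still disconnected. The standalone sketch below (plain C++, not Ginkgo code; a small path-halving union-find stands in for the library's internal disjoint_sets, and the 4x4 pattern is made up for illustration) traces that idea.

// Kruskal-style skeleton tree on a small lower-triangular CSR pattern.
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

int find(std::vector<int>& parent, int i)
{
    while (parent[i] != i) {
        parent[i] = parent[parent[i]];  // path halving
        i = parent[i];
    }
    return i;
}

int main()
{
    // strictly lower-triangular pattern (only col < row entries), so the
    // col >= row skip from the kernel is not needed here
    const std::vector<int> row_ptrs{0, 0, 1, 2, 4};
    const std::vector<int> cols{0, 1, 0, 2};
    const int size = 4;
    std::vector<int> parent(size);
    std::iota(parent.begin(), parent.end(), 0);
    std::vector<std::pair<int, int>> tree;  // kept (row, col) edges
    // rows are visited in ascending order, which doubles as ascending weight
    for (int row = 0; row < size; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const int col = cols[nz];
            const int row_rep = find(parent, row);
            const int col_rep = find(parent, col);
            if (row_rep != col_rep) {
                tree.emplace_back(row, col);  // edge max -> min
                parent[row_rep] = col_rep;
            }
        }
    }
    for (const auto& edge : tree) {
        std::printf("%d -> %d\n", edge.first, edge.second);  // 1->0, 2->1, 3->0
    }
    return 0;
}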
#include -#include #include @@ -14,7 +13,6 @@ #include "core/base/iterator_factory.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" #include "core/factorization/lu_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -106,44 +104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE); -template -void forest_from_factor( - std::shared_ptr exec, - const matrix::Csr* factors, - gko::factorization::elimination_forest& forest) -{ - const auto row_ptrs = factors->get_const_row_ptrs(); - const auto col_idxs = factors->get_const_col_idxs(); - const auto num_rows = static_cast(factors->get_size()[0]); - const auto parents = forest.parents.get_data(); - const auto children = forest.children.get_data(); - const auto child_ptrs = forest.child_ptrs.get_data(); - // filled with sentinel for unattached nodes - std::fill_n(parents, num_rows, num_rows); - for (IndexType row = 0; row < num_rows; row++) { - const auto row_begin = row_ptrs[row]; - const auto row_end = row_ptrs[row + 1]; - for (auto nz = row_begin; nz < row_end; nz++) { - const auto col = col_idxs[nz]; - // use the lower triangle, min row from column - if (col < row) { - parents[col] = std::min(parents[col], row); - } - } - } - // group by parent - vector parents_copy(parents, parents + num_rows, {exec}); - std::iota(children, children + num_rows, 0); - const auto it = detail::make_zip_iterator(parents_copy.begin(), children); - std::stable_sort(it, it + num_rows); - components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, - num_rows + 1, child_ptrs); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR); - - template void initialize(std::shared_ptr exec, const matrix::Csr* mtx, diff --git a/reference/factorization/elimination_forest_kernels.cpp b/reference/factorization/elimination_forest_kernels.cpp new file mode 100644 index 00000000000..dbf00d02d72 --- /dev/null +++ b/reference/factorization/elimination_forest_kernels.cpp @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/elimination_forest.hpp" + +#include +#include +#include + +#include + +#include "core/base/allocator.hpp" +#include "core/base/index_range.hpp" +#include "core/base/iterator_factory.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace reference { +namespace elimination_forest { + + +template +void compute_skeleton_tree(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* cols, + size_type size, IndexType* out_row_ptrs, + IndexType* out_cols) +{ + disjoint_sets sets(exec, size); + const auto nnz = static_cast(row_ptrs[size]); + vector> edges(exec); + edges.reserve(nnz / 2); + // collect edge list + for (auto row : irange(static_cast(size))) { + for (auto nz : irange(row_ptrs[row], row_ptrs[row + 1])) { + const auto col = cols[nz]; + if (col >= row) { + continue; + } + // edge contains (max, min) pair + edges.emplace_back(row, col); + } + } + // the edge list is now sorted by row, which also matches the edge weight + // we don't need to do any additional sorting operations + assert(std::is_sorted(edges.begin(), edges.end(), + [](auto a, auto b) { return a.first < b.first; })); + 
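The forest_from_factor kernel removed above (and re-added below as elimination_forest::from_factor) derives each node's parent from the factor's sparsity pattern alone: parent[j] = min{ i > j : l_ij != 0 }, i.e. the first row below the diagonal with an entry in column j. A standalone sketch of just that rule on a made-up 4x4 lower-triangular pattern (plain C++, illustrative only):

// parent[j] = smallest row index i > j with a nonzero l_ij, read from the
// strictly lower-triangular CSR pattern of L; roots keep the sentinel value.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> row_ptrs{0, 0, 1, 2, 4};
    const std::vector<int> cols{0, 1, 0, 2};
    const int num_rows = 4;
    // sentinel num_rows marks nodes without a parent (tree roots)
    std::vector<int> parents(num_rows, num_rows);
    for (int row = 0; row < num_rows; row++) {
        for (int nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
            const int col = cols[nz];
            parents[col] = std::min(parents[col], row);
        }
    }
    for (int node = 0; node < num_rows; node++) {
        std::printf("parent of %d: %d\n", node, parents[node]);  // 1, 2, 3, 4
    }
    return 0;
}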
// output helper array: Store row indices for output rows + // since the input is sorted by edge.first == row, this will be sorted + vector out_rows(size, exec); + IndexType output_count{}; + // Kruskal algorithm: Connect unconnected components using edges with + // ascending weight + for (const auto edge : edges) { + const auto first_rep = sets.find(edge.first); + const auto second_rep = sets.find(edge.second); + if (first_rep != second_rep) { + // we are only interested in the lower triangle, so we add an edge + // max -> min + out_rows[output_count] = edge.first; + out_cols[output_count] = edge.second; + output_count++; + sets.join(first_rep, second_rep); + } + } + assert(std::is_sorted(out_rows.begin(), out_rows.begin() + output_count)); + components::convert_idxs_to_ptrs(exec, out_rows.data(), output_count, size, + out_row_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_COMPUTE_SKELETON_TREE); + + +template +void from_factor(std::shared_ptr exec, + const matrix::Csr* factors, + gko::factorization::elimination_forest& forest) +{ + const auto row_ptrs = factors->get_const_row_ptrs(); + const auto col_idxs = factors->get_const_col_idxs(); + const auto num_rows = static_cast(factors->get_size()[0]); + const auto parents = forest.parents.get_data(); + const auto children = forest.children.get_data(); + const auto child_ptrs = forest.child_ptrs.get_data(); + // filled with sentinel for unattached nodes + std::fill_n(parents, num_rows, num_rows); + for (IndexType row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + // use the lower triangle, min row from column + if (col < row) { + parents[col] = std::min(parents[col], row); + } + } + } + // group by parent + vector parents_copy(parents, parents + num_rows, {exec}); + std::iota(children, children + num_rows, 0); + const auto it = detail::make_zip_iterator(parents_copy.begin(), children); + std::stable_sort(it, it + num_rows); + components::convert_idxs_to_ptrs(exec, parents_copy.data(), num_rows, + num_rows + 1, child_ptrs); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_ELIMINATION_FOREST_FROM_FACTOR); + + +} // namespace elimination_forest +} // namespace reference +} // namespace kernels +} // namespace gko diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp index 630aa2d0406..0638f8ca931 100644 --- a/reference/test/factorization/cholesky_kernels.cpp +++ b/reference/test/factorization/cholesky_kernels.cpp @@ -17,6 +17,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/matrix/csr_lookup.hpp" @@ -91,7 +92,8 @@ class Cholesky : public ::testing::Test { l_factor_ref->get_num_stored_elements()); combined = matrix_type::create(ref, combined_ref->get_size(), combined_ref->get_num_stored_elements()); - gko::factorization::compute_elim_forest(l_factor_ref.get(), forest); + gko::factorization::compute_elimination_forest(l_factor_ref.get(), + forest); // init sparsity lookup ref->copy(num_rows + 1, l_factor_ref->get_const_row_ptrs(), l_factor->get_row_ptrs()); @@ -140,6 +142,9 @@ class Cholesky : public ::testing::Test { fn(); } { + // structurally this is the example from Liu 
1990 + // "The Role of Elimination Trees in Sparse Factorization" + // https://doi.org/10.1137/0611010. SCOPED_TRACE("example"); this->setup( {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0}, @@ -233,16 +238,43 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes, PairTypenameNameGenerator); +TYPED_TEST(Cholesky, KernelComputeSkeletonTreeIsEquivalentToOriginalMatrix) +{ + using matrix_type = typename TestFixture::matrix_type; + using elimination_forest = typename TestFixture::elimination_forest; + using index_type = typename TestFixture::index_type; + this->forall_matrices( + [this] { + auto skeleton = matrix_type::create( + this->ref, this->mtx->get_size(), this->mtx->get_size()[0]); + std::unique_ptr skeleton_forest; + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::kernels::reference::elimination_forest::compute_skeleton_tree( + this->ref, this->mtx->get_const_row_ptrs(), + this->mtx->get_const_col_idxs(), this->mtx->get_size()[0], + skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + + this->assert_equal_forests(*skeleton_forest, *this->forest); + }, + true); +} + + TYPED_TEST(Cholesky, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::array row_nnz{this->ref, this->num_rows}; gko::kernels::reference::cholesky::symbolic_count( @@ -258,13 +290,12 @@ TYPED_TEST(Cholesky, KernelSymbolicCount) TYPED_TEST(Cholesky, KernelSymbolicFactorize) { using matrix_type = typename TestFixture::matrix_type; - using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type; using elimination_forest = typename TestFixture::elimination_forest; using index_type = typename TestFixture::index_type; this->forall_matrices( [this] { - gko::factorization::compute_elim_forest(this->mtx.get(), - this->forest); + gko::factorization::compute_elimination_forest(this->mtx.get(), + this->forest); gko::kernels::reference::cholesky::symbolic_count( this->ref, this->mtx.get(), *this->forest, this->l_factor->get_row_ptrs(), this->tmp); @@ -329,7 +360,7 @@ TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing) elimination_forest forest{this->ref, static_cast(this->num_rows)}; - gko::kernels::reference::cholesky::forest_from_factor( + gko::kernels::reference::elimination_forest::from_factor( this->ref, combined_factor.get(), forest); this->assert_equal_forests(forest, *forest_ref); diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp index ae2894d3696..e2696f11d89 100644 --- a/test/factorization/cholesky_kernels.cpp +++ b/test/factorization/cholesky_kernels.cpp @@ -14,9 +14,11 @@ #include #include +#include "core/components/disjoint_sets.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/elimination_forest.hpp" +#include "core/factorization/elimination_forest_kernels.hpp" #include "core/factorization/symbolic.hpp" #include "core/matrix/csr_lookup.hpp" #include 
"core/test/utils.hpp" @@ -45,14 +47,16 @@ class CholeskySymbolic : public CommonTestFixture { matrices.emplace_back( "example small", gko::initialize( - {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 0, 0, 1}}, ref)); + {{1, 0, 1, 0}, {0, 1, 0, 1}, {1, 0, 1, 0}, {0, 1, 0, 1}}, ref)); + // this is the example from Liu 1990 https://doi.org/10.1137/0611010. + // "The Role of Elimination Trees in Sparse Factorization" matrices.emplace_back("example", gko::initialize( {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0}, - {0, 1, 0, 1, 0, 0, 0, 0, 0, 1}, - {1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 0, 0, 0, 0, 1}, + {1, 0, 1, 0, 0, 0, 1, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 0, 1, 1}, {0, 1, 0, 0, 1, 0, 0, 0, 1, 1}, - {0, 0, 0, 0, 0, 1, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 1, 0, 0, 1, 1, 0, 0, 0}, {1, 0, 0, 0, 0, 1, 0, 1, 1, 1}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 0}, @@ -65,7 +69,7 @@ class CholeskySymbolic : public CommonTestFixture { {0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1}, {0, 0, 0, 0, 0, 0, 1, 1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}}, @@ -88,6 +92,11 @@ class CholeskySymbolic : public CommonTestFixture { std::ifstream ani1_amd_stream{gko::matrices::location_ani1_amd_mtx}; matrices.emplace_back("ani1_amd", gko::read(ani1_amd_stream, ref)); + std::ifstream ani4_stream{gko::matrices::location_ani4_mtx}; + matrices.emplace_back("ani4", gko::read(ani4_stream, ref)); + std::ifstream ani4_amd_stream{gko::matrices::location_ani4_amd_mtx}; + matrices.emplace_back("ani4_amd", + gko::read(ani4_amd_stream, ref)); } void assert_equal_forests(elimination_forest& lhs, elimination_forest& rhs, @@ -103,6 +112,27 @@ class CholeskySymbolic : public CommonTestFixture { } } + index_type check_mst(const std::unique_ptr& mst) const + { + gko::matrix_data mst_data; + const auto size = mst->get_size(); + mst->write(mst_data); + gko::disjoint_sets sets(this->ref, size[0]); + index_type weight_sum{}; + // need an IIFE because the assertions would return something + [&] { + for (const auto entry : mst_data.nonzeros) { + ASSERT_GT(entry.row, entry.column); + weight_sum += entry.row; + const auto row_rep = sets.find(entry.row); + const auto col_rep = sets.find(entry.column); + ASSERT_NE(row_rep, col_rep); + sets.join(row_rep, col_rep); + } + }(); + return weight_sum; + } + std::vector>> matrices; gko::array tmp; @@ -125,6 +155,52 @@ using Types = gko::test::cartesian_type_product_tmatrices) { + SCOPED_TRACE(pair.first); + const auto& mtx = pair.second; + // check for correctness: is mtx symmetric? 
+ GKO_ASSERT_MTX_EQ_SPARSITY(mtx, gko::as(mtx->transpose())); + const auto dmtx = gko::clone(this->exec, mtx); + const auto size = mtx->get_size(); + const auto skeleton = matrix_type::create(this->ref, size, size[0]); + const auto dskeleton = matrix_type::create(this->exec, size, size[0]); + + gko::kernels::reference::elimination_forest::compute_skeleton_tree( + this->ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + size[0], skeleton->get_row_ptrs(), skeleton->get_col_idxs()); + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest:: + compute_skeleton_tree(this->exec, dmtx->get_const_row_ptrs(), + dmtx->get_const_col_idxs(), size[0], + dskeleton->get_row_ptrs(), + dskeleton->get_col_idxs()); + + // check that the created graphs are trees and have the same edge sum + const auto weight_sum = this->check_mst(skeleton); + const auto dweight_sum = this->check_mst(dskeleton); + ASSERT_EQ(weight_sum, dweight_sum); + // while the MSTs may be different, they should produce the same + // elimination forest as the original matrix + std::unique_ptr forest; + std::unique_ptr skeleton_forest; + std::unique_ptr dskeleton_forest; + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(skeleton.get(), + skeleton_forest); + gko::factorization::compute_elimination_forest(dskeleton.get(), + dskeleton_forest); + // the parents array fully determines the elimination forest + GKO_ASSERT_ARRAY_EQ(forest->parents, skeleton_forest->parents); + GKO_ASSERT_ARRAY_EQ(skeleton_forest->parents, + dskeleton_forest->parents); + } +} + + TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) { using matrix_type = typename TestFixture::matrix_type; @@ -136,8 +212,8 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount) const auto dmtx = gko::clone(this->exec, mtx); std::unique_ptr forest; std::unique_ptr dforest; - gko::factorization::compute_elim_forest(mtx.get(), forest); - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array row_nnz{this->ref, mtx->get_size()[0]}; gko::array drow_nnz{this->exec, mtx->get_size()[0]}; @@ -163,7 +239,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) const auto dmtx = gko::clone(this->exec, mtx); const auto num_rows = mtx->get_size()[0]; std::unique_ptr forest; - gko::factorization::compute_elim_forest(mtx.get(), forest); + gko::factorization::compute_elimination_forest(mtx.get(), forest); gko::array row_ptrs{this->ref, num_rows + 1}; gko::kernels::reference::cholesky::symbolic_count( this->ref, mtx.get(), *forest, row_ptrs.get_data(), this->tmp); @@ -180,7 +256,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize) gko::array{this->exec, nnz}, row_ptrs); // need to call the device kernels to initialize dtmp std::unique_ptr dforest; - gko::factorization::compute_elim_forest(dmtx.get(), dforest); + gko::factorization::compute_elimination_forest(dmtx.get(), dforest); gko::array dtmp_ptrs{this->exec, num_rows + 1}; gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count( this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp); @@ -232,7 +308,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks) elimination_forest dforest{this->exec, static_cast(mtx->get_size()[0])}; - gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor( + gko::kernels::GKO_DEVICE_NAMESPACE::elimination_forest::from_factor( this->exec, dfactors.get(), 
dforest); this->assert_equal_forests(*forest, dforest); @@ -280,8 +356,9 @@ class Cholesky : public CommonTestFixture { mtx_chol_sparsity->copy_from(mtx_chol.get()); dmtx_chol_sparsity = sparsity_pattern_type::create(exec); dmtx_chol_sparsity->copy_from(mtx_chol_sparsity.get()); - gko::factorization::compute_elim_forest(mtx_chol.get(), forest); - gko::factorization::compute_elim_forest(dmtx_chol.get(), dforest); + gko::factorization::compute_elimination_forest(mtx_chol.get(), forest); + gko::factorization::compute_elimination_forest(dmtx_chol.get(), + dforest); } void forall_matrices(std::function<void()> fn)
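Once the parents array is filled, both from_factor implementations above group the children of each node: the node indices are sorted stably by their parent, and the sorted parent keys are turned into child_ptrs offsets (the kernels do this via a zip-iterator sort followed by convert_idxs_to_ptrs). A standalone sketch of that grouping step (plain C++, not Ginkgo code; a counting pass stands in for convert_idxs_to_ptrs, and the parents array is made up):

// Group elimination-tree children by parent and build CSR-like child_ptrs;
// the extra slot at index num_rows collects the roots (sentinel parent).
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    // parents[i] = parent of node i; roots point to the sentinel num_rows
    const std::vector<int> parents{2, 2, 3, 4};
    const int num_rows = 4;
    std::vector<int> children(num_rows);
    std::iota(children.begin(), children.end(), 0);
    // stable sort keeps children of the same parent in ascending node order
    std::stable_sort(children.begin(), children.end(),
                     [&](int a, int b) { return parents[a] < parents[b]; });
    // child_ptrs[p] .. child_ptrs[p + 1] delimits the children of node p
    std::vector<int> child_ptrs(num_rows + 2, 0);
    for (const int child : children) {
        child_ptrs[parents[child] + 1]++;  // histogram of children per parent
    }
    for (int i = 0; i <= num_rows; i++) {
        child_ptrs[i + 1] += child_ptrs[i];  // prefix sum
    }
    for (int p = 0; p <= num_rows; p++) {
        std::printf("node %d children:", p);
        for (int i = child_ptrs[p]; i < child_ptrs[p + 1]; i++) {
            std::printf(" %d", children[i]);
        }
        std::printf("\n");
    }
    return 0;
}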