Merge TSAN fixes
This PR fixes data races reported by TSAN (ThreadSanitizer). The changes include (a minimal sketch of the general pattern follows the list):

- RCM: lock the write mutex when checking the queue size
- SparsityCsr: use atomic load/store when checking whether other threads report that the matrix is unsorted
- Par IC/ILU(T): use atomic loads/stores in the same way as the CUDA/HIP implementations
- MG: 'fix' a write-after-write race in the kcycle stopping criterion. Honestly, I don't think this is necessary, but it stops TSAN from complaining.
- Batch solvers: fix the generation of the 3-point stencil matrix used in the tests, which caused data races.
- PGM: split the unified implementation of the `match_edge` kernel into separate implementations for each backend and add atomic loads/stores.
- NOT FIXED: [PGM] there is a data race in the non-deterministic part of `assign_to_exist_agg`; it is left as-is, since that part is marked as non-deterministic anyway.
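For illustration, a minimal sketch of the pattern applied throughout (plain C++ with std::atomic, not code from this PR; the flag and thread count are made up): a value that several threads may write and read concurrently becomes a relaxed atomic, which removes the data race without imposing any ordering.

```cpp
#include <atomic>
#include <thread>
#include <vector>

int main()
{
    // hypothetical stand-in for e.g. the SparsityCsr "unsorted" flag:
    // many threads may set it while others read it; with a plain bool this
    // is a data race that TSAN reports, with relaxed atomic accesses the
    // observable behavior is identical but well-defined
    std::atomic<bool> sorted{true};
    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([&] {
            // every writer stores the same value, so relaxed ordering suffices
            sorted.store(false, std::memory_order_relaxed);
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    return sorted.load(std::memory_order_relaxed) ? 1 : 0;
}
```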

Related PR: ginkgo-project#1743
MarcelKoch authored Dec 8, 2024
2 parents f95fc48 + 6ad5853 commit 059823f
Showing 15 changed files with 261 additions and 58 deletions.
40 changes: 40 additions & 0 deletions common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -17,6 +17,8 @@

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"


namespace gko {
@@ -28,6 +30,44 @@ namespace GKO_DEVICE_NAMESPACE {
* @ingroup pgm
*/
namespace pgm {
namespace kernels {


template <typename IndexType>
__global__ void match_edge(size_type size,
const IndexType* __restrict__ strongest_neighbor,
IndexType* __restrict__ agg)
{
auto tidx = static_cast<IndexType>(thread::get_thread_id_flat<int64>());
if (tidx >= size || load_relaxed(agg + tidx) != -1) {
return;
}
auto neighbor = strongest_neighbor[tidx];
if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
tidx <= neighbor) {
store_relaxed(agg + tidx, tidx);
store_relaxed(agg + neighbor, tidx);
}
}


} // namespace kernels


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
constexpr int default_block_size = 512;
auto num_blocks = ceildiv(agg.get_size(), default_block_size);
kernels::
match_edge<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
agg.get_size(), strongest_neighbor.get_const_data(),
agg.get_data());
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
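For readers unfamiliar with the kernel above: vertices tidx and neighbor form a mutual strongest-neighbor pair, and the `tidx <= neighbor` guard ensures only the thread with the smaller index performs the two stores. The race TSAN flagged is between that store to `agg[neighbor]` and the initial read of `agg[neighbor]` by the other thread, which is why both accesses now go through load_relaxed/store_relaxed. A sequential reference of the same matching logic (illustration only, not part of the change):

```cpp
// hypothetical host-side reference: aggregate every mutually matched pair
// under the smaller of the two indices
template <typename IndexType>
void match_edge_reference(std::size_t size,
                          const IndexType* strongest_neighbor, IndexType* agg)
{
    for (std::size_t i = 0; i < size; ++i) {
        const auto tidx = static_cast<IndexType>(i);
        if (agg[tidx] != -1) {
            continue;  // vertex is already aggregated
        }
        const auto neighbor = strongest_neighbor[tidx];
        if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
            tidx <= neighbor) {
            agg[tidx] = tidx;      // smaller index becomes the aggregate root
            agg[neighbor] = tidx;  // racy in parallel without atomic accesses
        }
    }
}
```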
25 changes: 0 additions & 25 deletions common/unified/multigrid/pgm_kernels.cpp
@@ -23,31 +23,6 @@ namespace GKO_DEVICE_NAMESPACE {
namespace pgm {


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
run_kernel(
exec,
[] GKO_KERNEL(auto tidx, auto strongest_neighbor_vals, auto agg_vals) {
if (agg_vals[tidx] != -1) {
return;
}
auto neighbor = strongest_neighbor_vals[tidx];
if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx &&
tidx <= neighbor) {
// Use the smaller index as agg point
agg_vals[tidx] = tidx;
agg_vals[neighbor] = tidx;
}
},
agg.get_size(), strongest_neighbor.get_const_data(), agg.get_data());
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
void count_unagg(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& agg, IndexType* num_unagg)
2 changes: 1 addition & 1 deletion core/test/utils/batch_helpers.hpp
@@ -105,7 +105,7 @@ std::unique_ptr<MatrixType> generate_3pt_stencil_batch_matrix(
{}};
for (int row = 0; row < num_rows; ++row) {
if (row > 0) {
-            data.nonzeros.emplace_back(row - 1, row, value_type{-1.0});
+            data.nonzeros.emplace_back(row, row - 1, value_type{-1.0});
}
data.nonzeros.emplace_back(row, row, value_type{6.0});
if (row < num_rows - 1) {
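To spell out the test fix above: with the old `(row - 1, row)` arguments, each row re-emitted the superdiagonal entry of the previous row instead of its own subdiagonal entry, so the generated matrix contained duplicate nonzeros and no subdiagonal at all; assembling the duplicates means two updates target the same output entry concurrently, which is the data race. A hypothetical trace of the emitted (row, col) coordinates for num_rows = 3:

```cpp
// old: emplace_back(row - 1, row, ...)     new: emplace_back(row, row - 1, ...)
//   row 0: (0,0) (0,1)                       row 0: (0,0) (0,1)
//   row 1: (0,1) (1,1) (1,2)  <- duplicate   row 1: (1,0) (1,1) (1,2)
//   row 2: (1,2) (2,2)        <- duplicate   row 2: (2,1) (2,2)
```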
25 changes: 25 additions & 0 deletions dpcpp/components/atomic.dp.hpp
@@ -278,6 +278,31 @@ __dpct_inline__ T atomic_max(T* __restrict__ addr, T val)
}


template <sycl::access::address_space addressSpace = atomic::global_space,
typename T>
__dpct_inline__ void store(
T* __restrict__ addr, T val,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
{
sycl::atomic_ref<T, sycl::memory_order::relaxed,
atomic::memory_scope_v<addressSpace>, addressSpace>
obj(*addr);
obj.store(val, memoryOrder);
}


template <sycl::access::address_space addressSpace = atomic::global_space,
typename T>
__dpct_inline__ T load(T* __restrict__ addr, sycl::memory_order memoryOrder =
sycl::memory_order::relaxed)
{
sycl::atomic_ref<T, sycl::memory_order::relaxed,
atomic::memory_scope_v<addressSpace>, addressSpace>
obj(*addr);
return obj.load(memoryOrder);
}


} // namespace dpcpp
} // namespace kernels
} // namespace gko
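A usage sketch for the new helpers (hypothetical kernel, assuming USM pointers `data` and `flag` and a sycl::queue `queue`; not code from this PR): routing every access to a contended element through load/store wraps it in a sycl::atomic_ref, so concurrent work-items no longer race on it.

```cpp
// hypothetical: record in flag[0] whether any entry of data is negative;
// both the read and the write of flag[0] are atomic
queue.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
    if (data[idx] < 0 && load(flag, sycl::memory_order_relaxed) == 0) {
        store(flag, 1, sycl::memory_order_relaxed);
    }
});
```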
30 changes: 30 additions & 0 deletions dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -12,6 +12,7 @@
#include <ginkgo/core/base/math.hpp>

#include "dpcpp/base/onedpl.hpp"
#include "dpcpp/components/atomic.dp.hpp"


namespace gko {
Expand All @@ -25,6 +26,35 @@ namespace dpcpp {
namespace pgm {


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
exec->get_queue()->submit([size = agg.get_size(), agg = agg.get_data(),
strongest_neighbor =
strongest_neighbor.get_const_data()](
sycl::handler& cgh) {
cgh.parallel_for(
sycl::range<1>{static_cast<std::size_t>(size)},
[=](sycl::id<1> idx_id) {
auto tidx = static_cast<IndexType>(idx_id[0]);
if (load(agg + tidx, sycl::memory_order_relaxed) != -1) {
return;
}
auto neighbor = strongest_neighbor[tidx];
if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
tidx <= neighbor) {
store(agg + tidx, tidx, sycl::memory_order_relaxed);
store(agg + neighbor, tidx, sycl::memory_order_relaxed);
}
});
});
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
IndexType* row_idxs, IndexType* col_idxs)
96 changes: 95 additions & 1 deletion omp/components/atomic.hpp
@@ -52,7 +52,7 @@ inline ResultType copy_cast(const ValueType& val)


template <>
- void atomic_add(half& out, half val)
+ inline void atomic_add(half& out, half val)
{
#ifdef __NVCOMPILER
// NVC++ uses atomic capture on uint16 leads the following error.
@@ -85,6 +85,100 @@ void atomic_add(half& out, half val)
}


// A bug in Clang 17 prevents us from merging the double and float
// implementations into a single template: the compiler rejects the
// templated version, while GCC compiles it without complaint.
inline void store(double* addr, double val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(float* addr, float val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(int32* addr, int32 val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(int64* addr, int64 val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(half* addr, half val)
{
auto uint_addr = copy_cast<uint16_t*>(addr);
auto uint_val = copy_cast<uint16_t>(val);
#pragma omp atomic write
*uint_addr = uint_val;
}

template <typename T>
inline void store(std::complex<T>* addr, std::complex<T> val)
{
auto values = reinterpret_cast<T*>(addr);
store(values + 0, real(val));
store(values + 1, imag(val));
}


// The same Clang 17 issue applies to these load overloads.
inline float load(float* addr)
{
float val;
#pragma omp atomic read
val = *addr;
return val;
}

inline double load(double* addr)
{
double val;
#pragma omp atomic read
val = *addr;
return val;
}

inline int32 load(int32* addr)
{
    int32 val;
#pragma omp atomic read
    val = *addr;
    return val;
}

inline int64 load(int64* addr)
{
    int64 val;
#pragma omp atomic read
    val = *addr;
    return val;
}

inline half load(half* addr)
{
uint16_t uint_val;
auto uint_addr = copy_cast<uint16_t*>(addr);
#pragma omp atomic read
uint_val = *uint_addr;
return copy_cast<half>(uint_val);
}

template <typename T>
inline std::complex<T> load(std::complex<T>* addr)
{
auto values = reinterpret_cast<T*>(addr);
return {load(values + 0), load(values + 1)};
}


} // namespace omp
} // namespace kernels
} // namespace gko
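A sketch of how these helpers are meant to be used (hypothetical sweep; `dependency`, `update`, `a_vals`, and `num_nonzeros` are placeholders, not names from this PR): inside an OpenMP parallel sweep, every read of a value that another thread may be writing goes through load, and every write through store, so the previously plain racy accesses become `omp atomic` operations.

```cpp
// hypothetical asynchronous sweep over the nonzeros of a factor: entry nz
// depends on another entry that a different thread may update concurrently
#pragma omp parallel for
for (std::int64_t nz = 0; nz < num_nonzeros; ++nz) {
    auto dep_val = load(vals + dependency[nz]);  // atomic read
    auto new_val = update(a_vals[nz], dep_val);
    if (is_finite(new_val)) {
        store(vals + nz, new_val);  // atomic write
    }
}
```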
8 changes: 5 additions & 3 deletions omp/factorization/par_ic_kernels.cpp
@@ -9,6 +9,7 @@
#include <ginkgo/core/matrix/csr.hpp>

#include "core/base/utils.hpp"
#include "omp/components/atomic.hpp"


namespace gko {
@@ -76,7 +77,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
auto l_col = l_col_idxs[l_begin];
auto lh_row = l_col_idxs[lh_begin];
if (l_col == lh_row && l_col < col) {
-                    sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
+                    sum += load(l_vals + l_begin) *
+                           conj(load(l_vals + lh_begin));
}
l_begin += (l_col <= lh_row);
lh_begin += (lh_row <= l_col);
@@ -85,11 +87,11 @@
if (row == col) {
new_val = sqrt(new_val);
} else {
-                auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
new_val = new_val / diag;
}
if (is_finite(new_val)) {
-                l_vals[l_nz] = new_val;
+                store(l_vals + l_nz, new_val);
}
}
}
8 changes: 5 additions & 3 deletions omp/factorization/par_ict_kernels.cpp
@@ -17,6 +17,7 @@
#include "core/base/utils.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/csr_builder.hpp"
#include "omp/components/atomic.hpp"
#include "omp/components/csr_spgeam.hpp"


@@ -69,7 +70,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
auto l_col = l_col_idxs[l_begin];
auto lh_row = l_col_idxs[lh_begin];
if (l_col == lh_row && l_col < col) {
-                    sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
+                    sum +=
+                        load(l_vals + l_begin) * conj(load(l_vals + lh_begin));
}
if (lh_row == row) {
lh_nz = lh_begin;
@@ -81,11 +83,11 @@
if (row == col) {
new_val = sqrt(new_val);
} else {
-                auto diag = l_vals[l_row_ptrs[col + 1] - 1];
+                auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
new_val = new_val / diag;
}
if (is_finite(new_val)) {
-                l_vals[l_nz] = new_val;
+                store(l_vals + l_nz, new_val);
}
}
}
11 changes: 7 additions & 4 deletions omp/factorization/par_ilu_kernels.cpp
@@ -10,6 +10,8 @@
#include <ginkgo/core/matrix/coo.hpp>
#include <ginkgo/core/matrix/csr.hpp>

#include "omp/components/atomic.hpp"


namespace gko {
namespace kernels {
@@ -57,7 +59,8 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
auto col_l = col_idxs_l[row_l];
auto col_u = col_idxs_u[row_u];
if (col_l == col_u) {
-                    last_operation = vals_l[row_l] * vals_u[row_u];
+                    last_operation =
+                        load(vals_l + row_l) * load(vals_u + row_u);
sum -= last_operation;
} else {
last_operation = zero<ValueType>();
Expand All @@ -74,14 +77,14 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
sum += last_operation; // undo the last operation

if (row > col) { // modify entry in L
-                auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1];
+                auto to_write = sum / load(vals_u + row_ptrs_u[col + 1] - 1);
if (is_finite(to_write)) {
-                    vals_l[row_l - 1] = to_write;
+                    store(vals_l + row_l - 1, to_write);
}
} else { // modify entry in U
auto to_write = sum;
if (is_finite(to_write)) {
vals_u[row_u - 1] = to_write;
store(vals_u + row_u - 1, to_write);
}
}
}
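For context, these sweeps implement the asynchronous fine-grained parallel ILU fixed-point iteration of Chow and Patel: each nonzero of the factors is repeatedly recomputed from possibly stale neighboring values, so per-element atomicity of the loads and stores is all the iteration needs to stay well-defined. With unit-diagonal $L$, the fixed-point equations the loop body evaluates are

$$l_{ij} = \frac{1}{u_{jj}}\Bigl(a_{ij} - \sum_{k<j} l_{ik}\,u_{kj}\Bigr) \quad (i > j), \qquad u_{ij} = a_{ij} - \sum_{k<i} l_{ik}\,u_{kj} \quad (i \le j).$$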