From f8cadcf422a367442e1aa091be7e988e8b002163 Mon Sep 17 00:00:00 2001 From: linhu-nv <141609318+linhu-nv@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:21:25 +0800 Subject: [PATCH] remove unnecessary sync between thrust ops and host threads (#160) Fixes issue [#148](https://github.com/rapidsai/wholegraph/issues/148): removes the unnecessary synchronization between Thrust operations and host CPU threads. Authors: - https://github.com/linhu-nv Approvers: - Chuang Zhu (https://github.com/chuangz0) URL: https://github.com/rapidsai/wholegraph/pull/160 --- cpp/src/graph_ops/append_unique_func.cuh | 4 ++-- .../unweighted_sample_without_replacement_func.cuh | 4 ++-- .../weighted_sample_without_replacement_func.cuh | 6 +++--- .../functions/exchange_embeddings_nccl_func.cu | 4 ++-- .../wholememory_ops/functions/exchange_ids_nccl_func.cu | 8 +++++--- .../functions/nvshmem_gather_scatter_func.cuh | 4 ++-- 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/graph_ops/append_unique_func.cuh b/cpp/src/graph_ops/append_unique_func.cuh index ff623a22b..761fabb63 100644 --- a/cpp/src/graph_ops/append_unique_func.cuh +++ b/cpp/src/graph_ops/append_unique_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -316,7 +316,7 @@ void graph_append_unique_func(void* target_nodes_ptr, <<>>(value_id, bucket_count_ptr); WM_CUDA_CHECK(cudaGetLastError()); wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), bucket_count_ptr, bucket_count_ptr + num_bucket_count, (int*)bucket_prefix_sum_ptr); diff --git a/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh index 291b26b2d..be0b261be 100644 --- a/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh +++ b/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -337,7 +337,7 @@ void wholegraph_csr_unweighted_sample_without_replacement_func( // prefix sum wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), tmp_sample_count_mem_pointer, tmp_sample_count_mem_pointer + center_node_count + 1, (int*)output_sample_offset); diff --git a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh index de75d7394..057d4c0c4 100644 --- a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh +++ b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -462,7 +462,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func( // prefix sum wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), tmp_sample_count_mem_pointer, tmp_sample_count_mem_pointer + center_node_count + 1, static_cast(output_sample_offset)); @@ -500,7 +500,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func( raft::random::detail::DeviceState rngstate(_rngstate); if (max_sample_count > sample_count_threshold) { wholememory_ops::wm_thrust_allocator tmp_thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(tmp_thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(tmp_thrust_allocator).on(stream), tmp_neighbor_counts_mem_pointer, tmp_neighbor_counts_mem_pointer + center_node_count + 1, tmp_neighbor_counts_mem_pointer); diff --git a/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu b/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu index 7cb96bcb4..88d7f331c 100644 --- a/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu +++ b/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -126,7 +126,7 @@ void dedup_indice_and_gradients_temp_func(int64_t* run_count, int* dev_mapping_sequence = static_cast(mapping_sequence_handle.device_malloc(raw_count * 2, WHOLEMEMORY_DT_INT)); int* dev_indice_mapping = dev_mapping_sequence + raw_count; - thrust::sequence(thrust::cuda::par(allocator).on(stream), + thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream), dev_mapping_sequence, dev_mapping_sequence + raw_count, 0); diff --git a/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu b/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu index 53df31be0..137b10470 100644 --- a/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu +++ b/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,8 +59,10 @@ void exchange_ids_temp_func(const void* indices_before_sort, int64_t* seq_indices = reinterpret_cast(allocator.allocate( wholememory_get_memory_element_count_from_array(&indices_desc) * sizeof(int64_t))); - thrust::sequence( - thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indices_desc.size, 0); + thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream), + seq_indices, + seq_indices + indices_desc.size, + 0); // use UTypeT to put minus indices at last. using UTypeT = typename UnsignedType::UType; const UTypeT* indices_to_sort = static_cast(indices_before_sort); diff --git a/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh b/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh index ea905cd93..a0091c31c 100644 --- a/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh +++ b/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,7 @@ void sort_index_in_pair(const void* indices_before_sort, IndexT* seq_indices = reinterpret_cast(allocator.allocate(indice_count * sizeof(IndexT))); thrust::sequence( - thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indice_count, 0); + thrust::cuda::par_nosync(allocator).on(stream), seq_indices, seq_indices + indice_count, 0); // TODO: use unsigned type (wm_ops::UTypeT) can put all negative indices at last. But maybe // later... using UTypeT = typename UnsignedType::UType; auto indices_to_sort = static_cast(indices_before_sort);