
remove unnecessary sync between thrust ops and host threads (#160)
Fix for issue #148 (https://github.com/rapidsai/wholegraph/issues/148): remove unnecessary sync between thrust ops and host CPU threads.

Authors:
  - https://github.com/linhu-nv

Approvers:
  - Chuang Zhu (https://github.com/chuangz0)

URL: #160
linhu-nv authored Apr 30, 2024
1 parent 8624b40 commit f8cadcf
Showing 6 changed files with 16 additions and 14 deletions.
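
For context: thrust::cuda::par(alloc).on(stream) may block the calling host thread until the algorithm has finished on the device, while thrust::cuda::par_nosync(alloc).on(stream) (available since Thrust 1.16) only enqueues the work on the stream and returns, leaving any needed synchronization to the caller. A minimal standalone sketch of the difference, with illustrative names and without the wm_thrust_allocator this codebase passes in for temporary storage:

    #include <cuda_runtime.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      thrust::device_vector<int> counts(1024, 1);
      thrust::device_vector<int> offsets(1024);

      // par may synchronize: the host thread can stall here until the scan is done.
      thrust::exclusive_scan(thrust::cuda::par.on(stream),
                             counts.begin(), counts.end(), offsets.begin());

      // par_nosync only launches the scan on the stream and returns immediately;
      // kernels submitted to the same stream afterwards are still ordered after it,
      // so no host-side wait is needed between device-to-device steps.
      thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
                             counts.begin(), counts.end(), offsets.begin());

      cudaStreamSynchronize(stream);  // required before the host reads the results
      cudaStreamDestroy(stream);
      return 0;
    }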
4 changes: 2 additions & 2 deletions cpp/src/graph_ops/append_unique_func.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -316,7 +316,7 @@ void graph_append_unique_func(void* target_nodes_ptr,
     <<<num_blocks, kAssignThreadBlockSize, 0, stream>>>(value_id, bucket_count_ptr);
   WM_CUDA_CHECK(cudaGetLastError());
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          bucket_count_ptr,
                          bucket_count_ptr + num_bucket_count,
                          (int*)bucket_prefix_sum_ptr);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -337,7 +337,7 @@ void wholegraph_csr_unweighted_sample_without_replacement_func(

   // prefix sum
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          tmp_sample_count_mem_pointer,
                          tmp_sample_count_mem_pointer + center_node_count + 1,
                          (int*)output_sample_offset);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -462,7 +462,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(

   // prefix sum
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          tmp_sample_count_mem_pointer,
                          tmp_sample_count_mem_pointer + center_node_count + 1,
                          static_cast<int*>(output_sample_offset));
@@ -500,7 +500,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
   raft::random::detail::DeviceState<raft::random::detail::PCGenerator> rngstate(_rngstate);
   if (max_sample_count > sample_count_threshold) {
     wholememory_ops::wm_thrust_allocator tmp_thrust_allocator(p_env_fns);
-    thrust::exclusive_scan(thrust::cuda::par(tmp_thrust_allocator).on(stream),
+    thrust::exclusive_scan(thrust::cuda::par_nosync(tmp_thrust_allocator).on(stream),
                            tmp_neighbor_counts_mem_pointer,
                            tmp_neighbor_counts_mem_pointer + center_node_count + 1,
                            tmp_neighbor_counts_mem_pointer);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -126,7 +126,7 @@ void dedup_indice_and_gradients_temp_func(int64_t* run_count,
   int* dev_mapping_sequence =
     static_cast<int*>(mapping_sequence_handle.device_malloc(raw_count * 2, WHOLEMEMORY_DT_INT));
   int* dev_indice_mapping = dev_mapping_sequence + raw_count;
-  thrust::sequence(thrust::cuda::par(allocator).on(stream),
+  thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream),
                    dev_mapping_sequence,
                    dev_mapping_sequence + raw_count,
                    0);
8 changes: 5 additions & 3 deletions cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,8 +59,10 @@ void exchange_ids_temp_func(const void* indices_before_sort,

   int64_t* seq_indices = reinterpret_cast<int64_t*>(allocator.allocate(
     wholememory_get_memory_element_count_from_array(&indices_desc) * sizeof(int64_t)));
-  thrust::sequence(
-    thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indices_desc.size, 0);
+  thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream),
+                   seq_indices,
+                   seq_indices + indices_desc.size,
+                   0);
   // use UTypeT to put minus indices at last.
   using UTypeT = typename UnsignedType<IndexT>::UType;
   const UTypeT* indices_to_sort = static_cast<const UTypeT*>(indices_before_sort);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,7 +80,7 @@ void sort_index_in_pair(const void* indices_before_sort,
   IndexT* seq_indices =
     reinterpret_cast<IndexT*>(allocator.allocate(indice_count * sizeof(IndexT)));
   thrust::sequence(
-    thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indice_count, 0);
+    thrust::cuda::par_nosync(allocator).on(stream), seq_indices, seq_indices + indice_count, 0);
   // TODO: use unsigned type (wm_ops::UTypeT) can put all negative indices at last. But maybe
   // later... using UTypeT = typename UnsignedType<IndexT>::UType;
   auto indices_to_sort = static_cast<const IndexT*>(indices_before_sort);
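
In these call sites the scan/sequence output is consumed by later kernels launched on the same CUDA stream, so stream ordering alone keeps the results correct and the blocking wait performed by par was pure overhead; that wait is what this commit removes. The flip side of par_nosync is that synchronization becomes the caller's responsibility whenever the host itself consumes a result, for example reading the last prefix-sum element to size an output buffer. A sketch of that pattern (hypothetical helper, not code from this repository):

    #include <cuda_runtime.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>

    // Enqueue a prefix sum with par_nosync, then read the grand total on the host.
    // d_counts and d_offsets each hold n + 1 ints in device memory.
    int total_after_scan(const int* d_counts, int* d_offsets, int n, cudaStream_t stream) {
      thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
                             d_counts, d_counts + n + 1, d_offsets);
      int total = 0;
      // This copy is ordered after the scan because it is queued on the same stream...
      cudaMemcpyAsync(&total, d_offsets + n, sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
      // ...but the host may only read total once the stream has drained.
      cudaStreamSynchronize(stream);
      return total;
    }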
