From f8cadcf422a367442e1aa091be7e988e8b002163 Mon Sep 17 00:00:00 2001 From: linhu-nv <141609318+linhu-nv@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:21:25 +0800 Subject: [PATCH] remove unnecessary sync between thrust ops and host threads (#160) Fixes issue [#148](https://github.com/rapidsai/wholegraph/issues/148): removes the unnecessary synchronization between Thrust operations and host CPU threads. Authors: - https://github.com/linhu-nv Approvers: - Chuang Zhu (https://github.com/chuangz0) URL: https://github.com/rapidsai/wholegraph/pull/160 --- cpp/src/graph_ops/append_unique_func.cuh | 4 ++-- .../unweighted_sample_without_replacement_func.cuh | 4 ++-- .../weighted_sample_without_replacement_func.cuh | 6 +++--- .../functions/exchange_embeddings_nccl_func.cu | 4 ++-- .../wholememory_ops/functions/exchange_ids_nccl_func.cu | 8 +++++--- .../functions/nvshmem_gather_scatter_func.cuh | 4 ++-- 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/graph_ops/append_unique_func.cuh b/cpp/src/graph_ops/append_unique_func.cuh index ff623a22b..761fabb63 100644 --- a/cpp/src/graph_ops/append_unique_func.cuh +++ b/cpp/src/graph_ops/append_unique_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -316,7 +316,7 @@ void graph_append_unique_func(void* target_nodes_ptr, <<>>(value_id, bucket_count_ptr); WM_CUDA_CHECK(cudaGetLastError()); wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), bucket_count_ptr, bucket_count_ptr + num_bucket_count, (int*)bucket_prefix_sum_ptr); diff --git a/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh index 291b26b2d..be0b261be 100644 --- a/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh +++ b/cpp/src/wholegraph_ops/unweighted_sample_without_replacement_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -337,7 +337,7 @@ void wholegraph_csr_unweighted_sample_without_replacement_func( // prefix sum wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), tmp_sample_count_mem_pointer, tmp_sample_count_mem_pointer + center_node_count + 1, (int*)output_sample_offset); diff --git a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh index de75d7394..057d4c0c4 100644 --- a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh +++ b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -462,7 +462,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func( // prefix sum wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream), tmp_sample_count_mem_pointer, tmp_sample_count_mem_pointer + center_node_count + 1, static_cast(output_sample_offset)); @@ -500,7 +500,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func( raft::random::detail::DeviceState rngstate(_rngstate); if (max_sample_count > sample_count_threshold) { wholememory_ops::wm_thrust_allocator tmp_thrust_allocator(p_env_fns); - thrust::exclusive_scan(thrust::cuda::par(tmp_thrust_allocator).on(stream), + thrust::exclusive_scan(thrust::cuda::par_nosync(tmp_thrust_allocator).on(stream), tmp_neighbor_counts_mem_pointer, tmp_neighbor_counts_mem_pointer + center_node_count + 1, tmp_neighbor_counts_mem_pointer); diff --git a/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu b/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu index 7cb96bcb4..88d7f331c 100644 --- a/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu +++ b/cpp/src/wholememory_ops/functions/exchange_embeddings_nccl_func.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -126,7 +126,7 @@ void dedup_indice_and_gradients_temp_func(int64_t* run_count, int* dev_mapping_sequence = static_cast(mapping_sequence_handle.device_malloc(raw_count * 2, WHOLEMEMORY_DT_INT)); int* dev_indice_mapping = dev_mapping_sequence + raw_count; - thrust::sequence(thrust::cuda::par(allocator).on(stream), + thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream), dev_mapping_sequence, dev_mapping_sequence + raw_count, 0); diff --git a/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu b/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu index 53df31be0..137b10470 100644 --- a/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu +++ b/cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,8 +59,10 @@ void exchange_ids_temp_func(const void* indices_before_sort, int64_t* seq_indices = reinterpret_cast(allocator.allocate( wholememory_get_memory_element_count_from_array(&indices_desc) * sizeof(int64_t))); - thrust::sequence( - thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indices_desc.size, 0); + thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream), + seq_indices, + seq_indices + indices_desc.size, + 0); // use UTypeT to put minus indices at last. using UTypeT = typename UnsignedType::UType; const UTypeT* indices_to_sort = static_cast(indices_before_sort); diff --git a/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh b/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh index ea905cd93..a0091c31c 100644 --- a/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh +++ b/cpp/src/wholememory_ops/functions/nvshmem_gather_scatter_func.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,7 @@ void sort_index_in_pair(const void* indices_before_sort, IndexT* seq_indices = reinterpret_cast(allocator.allocate(indice_count * sizeof(IndexT))); thrust::sequence( - thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indice_count, 0); + thrust::cuda::par_nosync(allocator).on(stream), seq_indices, seq_indices + indice_count, 0); // TODO: use unsigned type (wm_ops::UTypeT) can put all negative indices at last. But maybe // later... using UTypeT = typename UnsignedType::UType; auto indices_to_sort = static_cast(indices_before_sort);