
remove unnecessary sync between thrust ops and host threads (#160)
Fix for issue #148 (https://github.com/rapidsai/wholegraph/issues/148): remove unnecessary sync between thrust ops and host CPU threads.

Authors:
  - https://github.com/linhu-nv

Approvers:
  - Chuang Zhu (https://github.com/chuangz0)

URL: #160
linhu-nv authored Apr 30, 2024
1 parent 8624b40 commit f8cadcf
Showing 6 changed files with 16 additions and 14 deletions.
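
For context: thrust::cuda::par(alloc).on(stream) may block the calling host thread until the algorithm has finished on the device, while thrust::cuda::par_nosync(alloc).on(stream) (available since Thrust 1.16) only enqueues the work on the stream and returns, leaving any needed synchronization to the caller. A minimal standalone sketch of the difference, with illustrative names and without the wm_thrust_allocator this codebase passes in for temporary storage:

    #include <cuda_runtime.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>

    int main() {
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      thrust::device_vector<int> counts(1024, 1);
      thrust::device_vector<int> offsets(1024);

      // par may synchronize: the host thread can stall here until the scan is done.
      thrust::exclusive_scan(thrust::cuda::par.on(stream),
                             counts.begin(), counts.end(), offsets.begin());

      // par_nosync only launches the scan on the stream and returns immediately;
      // kernels submitted to the same stream afterwards are still ordered after it,
      // so no host-side wait is needed between device-to-device steps.
      thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
                             counts.begin(), counts.end(), offsets.begin());

      cudaStreamSynchronize(stream);  // required before the host reads the results
      cudaStreamDestroy(stream);
      return 0;
    }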
4 changes: 2 additions & 2 deletions cpp/src/graph_ops/append_unique_func.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -316,7 +316,7 @@ void graph_append_unique_func(void* target_nodes_ptr,
     <<<num_blocks, kAssignThreadBlockSize, 0, stream>>>(value_id, bucket_count_ptr);
   WM_CUDA_CHECK(cudaGetLastError());
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          bucket_count_ptr,
                          bucket_count_ptr + num_bucket_count,
                          (int*)bucket_prefix_sum_ptr);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -337,7 +337,7 @@ void wholegraph_csr_unweighted_sample_without_replacement_func(

   // prefix sum
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          tmp_sample_count_mem_pointer,
                          tmp_sample_count_mem_pointer + center_node_count + 1,
                          (int*)output_sample_offset);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -462,7 +462,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(

   // prefix sum
   wholememory_ops::wm_thrust_allocator thrust_allocator(p_env_fns);
-  thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
+  thrust::exclusive_scan(thrust::cuda::par_nosync(thrust_allocator).on(stream),
                          tmp_sample_count_mem_pointer,
                          tmp_sample_count_mem_pointer + center_node_count + 1,
                          static_cast<int*>(output_sample_offset));
@@ -500,7 +500,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
   raft::random::detail::DeviceState<raft::random::detail::PCGenerator> rngstate(_rngstate);
   if (max_sample_count > sample_count_threshold) {
     wholememory_ops::wm_thrust_allocator tmp_thrust_allocator(p_env_fns);
-    thrust::exclusive_scan(thrust::cuda::par(tmp_thrust_allocator).on(stream),
+    thrust::exclusive_scan(thrust::cuda::par_nosync(tmp_thrust_allocator).on(stream),
                            tmp_neighbor_counts_mem_pointer,
                            tmp_neighbor_counts_mem_pointer + center_node_count + 1,
                            tmp_neighbor_counts_mem_pointer);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -126,7 +126,7 @@ void dedup_indice_and_gradients_temp_func(int64_t* run_count,
   int* dev_mapping_sequence =
     static_cast<int*>(mapping_sequence_handle.device_malloc(raw_count * 2, WHOLEMEMORY_DT_INT));
   int* dev_indice_mapping = dev_mapping_sequence + raw_count;
-  thrust::sequence(thrust::cuda::par(allocator).on(stream),
+  thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream),
                    dev_mapping_sequence,
                    dev_mapping_sequence + raw_count,
                    0);
8 changes: 5 additions & 3 deletions cpp/src/wholememory_ops/functions/exchange_ids_nccl_func.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,8 +59,10 @@ void exchange_ids_temp_func(const void* indices_before_sort,

   int64_t* seq_indices = reinterpret_cast<int64_t*>(allocator.allocate(
     wholememory_get_memory_element_count_from_array(&indices_desc) * sizeof(int64_t)));
-  thrust::sequence(
-    thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indices_desc.size, 0);
+  thrust::sequence(thrust::cuda::par_nosync(allocator).on(stream),
+                   seq_indices,
+                   seq_indices + indices_desc.size,
+                   0);
   // use UTypeT to put minus indices at last.
   using UTypeT = typename UnsignedType<IndexT>::UType;
   const UTypeT* indices_to_sort = static_cast<const UTypeT*>(indices_before_sort);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,7 +80,7 @@ void sort_index_in_pair(const void* indices_before_sort,
   IndexT* seq_indices =
     reinterpret_cast<IndexT*>(allocator.allocate(indice_count * sizeof(IndexT)));
   thrust::sequence(
-    thrust::cuda::par(allocator).on(stream), seq_indices, seq_indices + indice_count, 0);
+    thrust::cuda::par_nosync(allocator).on(stream), seq_indices, seq_indices + indice_count, 0);
   // TODO: use unsigned type (wm_ops::UTypeT) can put all negative indices at last. But maybe
   // later... using UTypeT = typename UnsignedType<IndexT>::UType;
   auto indices_to_sort = static_cast<const IndexT*>(indices_before_sort);
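
In these call sites the scan/sequence output is consumed by later kernels launched on the same CUDA stream, so stream ordering alone keeps the results correct and the blocking wait performed by par was pure overhead; that wait is what this commit removes. The flip side of par_nosync is that synchronization becomes the caller's responsibility whenever the host itself consumes a result, for example reading the last prefix-sum element to size an output buffer. A sketch of that pattern (hypothetical helper, not code from this repository):

    #include <cuda_runtime.h>
    #include <thrust/execution_policy.h>
    #include <thrust/scan.h>

    // Enqueue a prefix sum with par_nosync, then read the grand total on the host.
    // d_counts and d_offsets each hold n + 1 ints in device memory.
    int total_after_scan(const int* d_counts, int* d_offsets, int n, cudaStream_t stream) {
      thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
                             d_counts, d_counts + n + 1, d_offsets);
      int total = 0;
      // This copy is ordered after the scan because it is queued on the same stream...
      cudaMemcpyAsync(&total, d_offsets + n, sizeof(int),
                      cudaMemcpyDeviceToHost, stream);
      // ...but the host may only read total once the stream has drained.
      cudaStreamSynchronize(stream);
      return total;
    }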
