faster network test

marius-team · Nov 22, 2023 · f5ecbb3
1 parent a93d1de
commit f5ecbb3
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 4 deletions.
diff --git a/src/cpp/src/data/dataloader.cpp b/src/cpp/src/data/dataloader.cpp
@@ -686,8 +686,9 @@ void DataLoader::loadCPUParameters(shared_ptr<Batch> batch, int id, bool load) {
                         torch::Tensor all_unique_nodes = torch::cat({all_unique_nodes_vec}, 0);
                         unique_indices = computeUniques(all_unique_nodes, graph_storage_->getNumNodesInMemory(), id);
 
-                        for (int i = 0; i < batch->sub_batches_.size(); i++) {
-                            batch->sub_batches_[i]->root_node_indices_ = unique_indices;
+                        batch->sub_batches_[0]->root_node_indices_ = unique_indices;
+                        for (int i = 1; i < batch->sub_batches_.size(); i++) {
+                            batch->sub_batches_[i]->root_node_indices_ = torch::Tensor();
                         }
 
 //                        t.stop();
@@ -713,7 +714,7 @@ void DataLoader::loadCPUParameters(shared_ptr<Batch> batch, int id, bool load) {
                                 size = unique_indices.size(0) - start;
                             }
 
-//                            batch->sub_batches_[i]->node_features_ = unique_features.narrow(0, count, size);
+//                            batch->sub_batches_[i]->root_node_indices_ = unique_features.narrow(0, count, size);
                             batch->sub_batches_[i]->node_features_ = graph_storage_->getNodeFeatures(unique_indices.narrow(0, start, size));
                         }
                     }

diff --git a/src/cpp/src/pipeline/pipeline_gpu.cpp b/src/cpp/src/pipeline/pipeline_gpu.cpp
@@ -214,12 +214,14 @@ void ComputeWorkerGPU::run() {
 
                     int unique_size = 0;
                     int feat_dim = batch->sub_batches_[0]->node_features_.size(1);
+                    int root_dim = batch->sub_batches_[0]->root_node_indices_.size(0);
 //                    for (int i = 0; i < batch->sub_batches_.size(); i++) {
 //                        unique_size += batch->sub_batches_[i]->node_features_.size(0);
 //                    }
                     unique_size = batch->sub_batches_[0]->node_features_.size(0) * batch->sub_batches_.size();
                     std::vector<torch::Tensor> inputs(batch->sub_batches_.size());
                     std::vector<torch::Tensor> unique_features_per_gpu(batch->sub_batches_.size());
+                    std::vector<torch::Tensor> broadcast_list(batch->sub_batches_.size());
 
 //                    std::cout<<"start"<<"\n";
 //                    std::cout<<unique_size<<"\n";
@@ -246,6 +248,12 @@ void ComputeWorkerGPU::run() {
 
                             unique_features_per_gpu[i] = unique_node_features;
                             inputs[i] = batch->sub_batches_[i]->node_features_;
+
+                            device_options = torch::TensorOptions().dtype(batch->sub_batches_[0]->root_node_indices_.dtype()).device(batch->sub_batches_[i]->node_features_.device());
+                            if (i > 0)
+                                broadcast_list[i] = torch::zeros({root_dim}, device_options);
+                            else
+                                broadcast_list[i] = batch->sub_batches_[i]->root_node_indices_;
                         }
 
                         #pragma omp single
@@ -254,6 +262,7 @@ void ComputeWorkerGPU::run() {
 
                             #ifdef MARIUS_CUDA
                                 torch::cuda::nccl::all_gather(inputs, unique_features_per_gpu);//, streams);
+                                torch::cuda::nccl::broadcast(broadcast_list);//, streams);
                             #endif
 
 //                            for (int j = 0; j < batch->sub_batches_.size(); j++) {
@@ -272,7 +281,7 @@ void ComputeWorkerGPU::run() {
                             CudaStreamGuard stream_guard(*(pipeline_->dataloader_->compute_streams_[i]));
                             auto device_options = torch::TensorOptions().dtype(batch->sub_batches_[i]->node_features_.dtype()).device(batch->sub_batches_[i]->node_features_.device());
 
-                            batch->sub_batches_[i]->unique_node_indices_ = torch::searchsorted(batch->sub_batches_[i]->root_node_indices_, batch->sub_batches_[i]->unique_node_indices_);
+                            batch->sub_batches_[i]->unique_node_indices_ = torch::searchsorted(broadcast_list[i], batch->sub_batches_[i]->unique_node_indices_);
 
 //                            batch->sub_batches_[i]->node_features_ = torch::zeros({batch->sub_batches_[i]->unique_node_indices_.size(0), feat_dim}, device_options);
 //                            torch::index_select_out(batch->sub_batches_[i]->node_features_, unique_features_per_gpu[i], 0, batch->sub_batches_[i]->unique_node_indices_);