Skip to content

Commit

Permalink
Merge Add local to global index mapping
Browse files Browse the repository at this point in the history
This PR adds a local-to-global mapping to the `index_map` class. It is necessary for the follow-up PR #1639. This also introduces a device-side view of a partition, and adds another member to the partition which stores the range-ids segmented by their part-id.

Related PR: #1707
  • Loading branch information
MarcelKoch authored Feb 28, 2025
2 parents ebb4725 + 71ac6a4 commit 7404a82
Show file tree
Hide file tree
Showing 26 changed files with 1,045 additions and 28 deletions.
12 changes: 12 additions & 0 deletions benchmark/test/reference/distributed_solver.profile.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,24 @@ DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin partition::build_from_contiguous
DEBUG: end partition::build_from_contiguous
DEBUG: begin partition::build_starting_indices
DEBUG: end partition::build_starting_indices
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin partition::build_ranges_by_part
DEBUG: end partition::build_ranges_by_part
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
Expand Down
12 changes: 12 additions & 0 deletions benchmark/test/reference/spmv_distributed.profile.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,24 @@ DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin partition::build_from_contiguous
DEBUG: end partition::build_from_contiguous
DEBUG: begin partition::build_starting_indices
DEBUG: end partition::build_starting_indices
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin partition::build_ranges_by_part
DEBUG: end partition::build_ranges_by_part
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::prefix_sum_nonnegative
DEBUG: end components::prefix_sum_nonnegative
DEBUG: begin copy
DEBUG: end copy
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
DEBUG: end components::fill_array
DEBUG: begin components::fill_array
Expand Down
91 changes: 90 additions & 1 deletion common/cuda_hip/distributed/index_map_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down Expand Up @@ -296,6 +296,95 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);


template <typename LocalIndexType, typename GlobalIndexType>
void map_to_global(
std::shared_ptr<const DefaultExecutor> exec,
device_partition<const LocalIndexType, const GlobalIndexType> partition,
device_segmented_array<const GlobalIndexType> remote_global_idxs,
experimental::distributed::comm_index_type rank,
const array<LocalIndexType>& local_idxs,
experimental::distributed::index_space is,
array<GlobalIndexType>& global_idxs)
{
auto range_bounds = partition.offsets_begin;
auto starting_indices = partition.starting_indices_begin;
const auto& ranges_by_part = partition.ranges_by_part;
auto local_idxs_it = local_idxs.get_const_data();
auto input_size = local_idxs.get_size();

auto policy = thrust_policy(exec);

global_idxs.resize_and_reset(local_idxs.get_size());
auto global_idxs_it = global_idxs.get_data();

auto map_local = [rank, ranges_by_part, range_bounds, starting_indices,
partition] __device__(auto lid) {
auto local_size =
static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

if (lid < 0 || lid >= local_size) {
return invalid_index<GlobalIndexType>();
}

auto local_ranges = ranges_by_part.get_segment(rank);
auto local_ranges_size =
static_cast<int64>(local_ranges.end - local_ranges.begin);

// the binary search finds the first local range, such that the starting
// index is larger than lid, thus lid is contained in the local range
// before that one
auto local_range_id =
binary_search(int64(0), local_ranges_size,
[=](const auto i) {
return starting_indices[local_ranges.begin[i]] >
lid;
}) -
1;
auto range_id = local_ranges.begin[local_range_id];

return static_cast<GlobalIndexType>(lid - starting_indices[range_id]) +
range_bounds[range_id];
};
auto map_non_local = [remote_global_idxs] __device__(auto lid) {
auto remote_size = static_cast<LocalIndexType>(
remote_global_idxs.flat_end - remote_global_idxs.flat_begin);

if (lid < 0 || lid >= remote_size) {
return invalid_index<GlobalIndexType>();
}

return remote_global_idxs.flat_begin[lid];
};
auto map_combined = [map_local, map_non_local, partition,
rank] __device__(auto lid) {
auto local_size =
static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

if (lid < local_size) {
return map_local(lid);
} else {
return map_non_local(lid - local_size);
}
};

if (is == experimental::distributed::index_space::local) {
thrust::transform(policy, local_idxs_it, local_idxs_it + input_size,
global_idxs_it, map_local);
}
if (is == experimental::distributed::index_space::non_local) {
thrust::transform(policy, local_idxs_it, local_idxs_it + input_size,
global_idxs_it, map_non_local);
}
if (is == experimental::distributed::index_space::combined) {
thrust::transform(policy, local_idxs_it, local_idxs_it + input_size,
global_idxs_it, map_combined);
}
}

GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
35 changes: 34 additions & 1 deletion common/cuda_hip/distributed/partition_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/distributed/partition_kernels.hpp"

#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"
#include "common/unified/base/kernel_launch.hpp"
#include "core/components/fill_array_kernels.hpp"

Expand Down Expand Up @@ -132,6 +135,36 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);


void build_ranges_by_part(std::shared_ptr<const DefaultExecutor> exec,
const int* range_parts, size_type num_ranges,
int num_parts, array<size_type>& range_ids,
array<int64>& sizes)
{
auto policy = thrust_policy(exec);

range_ids.resize_and_reset(num_ranges);
auto range_ids_ptr = range_ids.get_data();
thrust::sequence(policy, range_ids_ptr, range_ids_ptr + num_ranges);

// mutable copy of range_parts such that it can be used as keys for sorting
array<int> range_parts_copy{exec, num_ranges};
thrust::copy_n(policy, range_parts, num_ranges,
range_parts_copy.get_data());
auto range_parts_ptr = range_parts_copy.get_data();

thrust::stable_sort_by_key(policy, range_parts_ptr,
range_parts_ptr + num_ranges, range_ids_ptr);

sizes.resize_and_reset(num_parts);
auto sizes_ptr = sizes.get_data();
thrust::fill_n(policy, sizes_ptr, num_parts, 0);
thrust::for_each_n(policy, range_parts_ptr, num_ranges,
[sizes_ptr] __device__(const size_type pid) {
atomic_add(sizes_ptr + pid, int64(1));
});
}


} // namespace partition
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
4 changes: 2 additions & 2 deletions core/base/segmented_array.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand Down Expand Up @@ -29,7 +29,7 @@ struct device_segmented_array {
T* end;
};

constexpr segment get_segment(size_type segment_id)
constexpr segment get_segment(size_type segment_id) const
{
GKO_ASSERT(segment_id < (offsets_end - offsets_begin));
return {flat_begin + offsets_begin[segment_id],
Expand Down
2 changes: 2 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ GKO_STUB_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_MAPPING);
GKO_STUB_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED);
GKO_STUB(GKO_DECLARE_PARTITION_BUILD_RANGES_BY_PART);


} // namespace partition
Expand All @@ -343,6 +344,7 @@ namespace index_map {

GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_BUILD_MAPPING);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);
GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
Expand Down
61 changes: 61 additions & 0 deletions core/distributed/device_partition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2024 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <ginkgo/core/distributed/partition.hpp>

#include "core/base/segmented_array.hpp"

namespace gko {


template <typename LocalIndexType, typename GlobalIndexType>
struct device_partition {
using local_index_type = LocalIndexType;
using global_index_type = GlobalIndexType;
using comm_index_type = experimental::distributed::comm_index_type;

comm_index_type num_parts;
comm_index_type num_empty_parts;
size_type size;
global_index_type* offsets_begin;
global_index_type* offsets_end;
local_index_type* starting_indices_begin;
local_index_type* starting_indices_end;
local_index_type* part_sizes_begin;
local_index_type* part_sizes_end;
const comm_index_type* part_ids_begin;
const comm_index_type* part_ids_end;
device_segmented_array<const size_type> ranges_by_part;
};


/**
* Explicitly create a const version of device_partition.
*/
template <typename LocalIndexType, typename GlobalIndexType>
constexpr device_partition<const LocalIndexType, const GlobalIndexType>
to_device_const(
const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
partition)
{
auto num_ranges = partition->get_num_ranges();
auto num_parts = partition->get_num_parts();
return {num_parts,
partition->get_num_empty_parts(),
partition->get_size(),
partition->get_range_bounds(),
partition->get_range_bounds() + num_ranges + 1,
partition->get_range_starting_indices(),
partition->get_range_starting_indices() + num_ranges,
partition->get_part_sizes(),
partition->get_part_sizes() + num_parts,
partition->get_part_ids(),
partition->get_part_ids() + num_parts,
to_device(partition->get_ranges_by_part())};
}


} // namespace gko
18 changes: 17 additions & 1 deletion core/distributed/index_map.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

Expand All @@ -13,6 +13,7 @@ namespace index_map_kernels {

GKO_REGISTER_OPERATION(build_mapping, index_map::build_mapping);
GKO_REGISTER_OPERATION(map_to_local, index_map::map_to_local);
GKO_REGISTER_OPERATION(map_to_global, index_map::map_to_global);


} // namespace index_map_kernels
Expand Down Expand Up @@ -89,6 +90,21 @@ array<LocalIndexType> index_map<LocalIndexType, GlobalIndexType>::map_to_local(
}


template <typename LocalIndexType, typename GlobalIndexType>
array<GlobalIndexType>
index_map<LocalIndexType, GlobalIndexType>::map_to_global(
const array<LocalIndexType>& local_idxs, index_space index_space_v) const
{
array<GlobalIndexType> global_idxs(exec_);

exec_->run(index_map_kernels::make_map_to_global(
to_device_const(partition_.get()), to_device(remote_global_idxs_),
rank_, local_idxs, index_space_v, global_idxs));

return global_idxs;
}


template <typename LocalIndexType, typename GlobalIndexType>
index_map<LocalIndexType, GlobalIndexType>::index_map(
std::shared_ptr<const Executor> exec,
Expand Down
Loading

0 comments on commit 7404a82

Please sign in to comment.