diff --git a/c++/nda/mpi/broadcast.hpp b/c++/nda/mpi/broadcast.hpp
index a40086bb..c8436897 100644
--- a/c++/nda/mpi/broadcast.hpp
+++ b/c++/nda/mpi/broadcast.hpp
@@ -45,17 +45,7 @@ namespace nda {
    * - the array/view is not contiguous with positive strides or
    * - one of the MPI calls fails.
    *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // broadcast the array to all processes
-   * mpi::broadcast(A);
-   * @endcode
+   * See @ref ex6_p1 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be broadcasted from/into.
diff --git a/c++/nda/mpi/gather.hpp b/c++/nda/mpi/gather.hpp
index d96db74c..601dc259 100644
--- a/c++/nda/mpi/gather.hpp
+++ b/c++/nda/mpi/gather.hpp
@@ -61,7 +61,38 @@ namespace nda {
    * @{
    */
 
-  // Helper function that (all)gathers arrays/views.
+  /**
+   * @brief Implementation of an MPI gather for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function gathers C-ordered input arrays/views from all processes in the given communicator and
+   * makes the result available on the root process (`all == false`) or on all processes (`all == true`). The
+   * arrays/views are joined along the first dimension.
+   *
+   * It is expected that all input arrays/views have the same shape on all processes except for the first dimension. The
+   * function throws an exception, if
+   * - the input array/view is not contiguous with positive strides,
+   * - the output array/view is not contiguous with positive strides on receiving ranks,
+   * - the output view does not have the correct shape on receiving ranks or
+   * - any of the MPI calls fails.
+   *
+   * The input arrays/views are simply concatenated along their first dimension. The content of the output array/view
+   * depends on the MPI rank and whether it receives the data or not:
+   * - On receiving ranks, it contains the gathered data and has a shape that is the same as the shape of the input
+   * array/view except for the first dimension, which is the sum of the extents of all input arrays/views along the
+   * first dimension.
+   * - On non-receiving ranks, the output array/view is ignored and left unchanged.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @param a_in Array/view to be gathered.
+   * @param a_out Array/view to gather into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> and std::decay_t<A1>::is_stride_order_C()
              and is_regular_or_view_v<A2> and std::decay_t<A2>::is_stride_order_C())
@@ -103,21 +134,9 @@ namespace nda {
    *
    * @details This function is lazy, i.e. it returns an mpi::lazy<mpi::tag::gather, A> object without performing the
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
-   * initialize/assign to nda::basic_array and nda::basic_array_view objects:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
+   * initialize/assign to nda::basic_array and nda::basic_array_view objects.
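+   *
+   * For instance, a minimal sketch (the `int` element type and the fill with the process rank are only for
+   * illustration):
+   *
+   * @code{.cpp}
+   * // create an array on all processes
+   * nda::array<int, 2> A(3, 4);
+   *
+   * // fill the array on each process, e.g. with the rank of the process
+   * mpi::communicator comm;
+   * A() = comm.rank();
+   *
+   * // gather the arrays on the root process
+   * nda::array<int, 2> B = nda::lazy_mpi_gather(A);
+   * @endcode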
    *
-   * // gather the arrays on the root process
-   * nda::array<int, 2> B = nda::lazy_mpi_gather(A);
-   * @endcode
-   *
-   * The behavior is otherwise identical to nda::mpi_gather.
+   * The behavior is otherwise similar to nda::mpi_gather.
    *
    * @warning MPI calls are done in the `invoke` and `shape` methods of the `mpi::lazy` object. If one rank calls one of
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
@@ -142,30 +161,9 @@ namespace nda {
    * makes the result available on the root process (`all == false`) or on all processes (`all == true`). The
    * arrays/views are joined along the first dimension.
    *
-   * It is expected that all input arrays/views have the same shape on all processes except for the first dimension. The
-   * function throws an exception, if
-   * - the input array/view is not contiguous with positive strides or
-   * - any of the MPI calls fails.
-   *
-   * The input arrays/views are simply concatenated along their first dimension. The shape of the resulting array
-   * depends on the MPI rank and whether it receives the data or not:
-   * - On receiving ranks, the shape is the same as the shape of the input array/view except for the first dimension,
-   * which is the sum of the extents of all input arrays/views along the first dimension.
-   * - On non-receiving ranks, the shape is empty, i.e. `(0,0,...,0)`.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // gather the arrays on the root process
-   * auto B = mpi::gather(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_gather_capi.
    *
-   * Here, the array `B` has the shape `(3 * comm.size(), 4)` on the root process and `(0, 0)` on all other processes.
+   * See @ref ex6_p2 for examples.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type with C-layout.
    * @param a Array/view to be gathered.
diff --git a/c++/nda/mpi/reduce.hpp b/c++/nda/mpi/reduce.hpp
index 4f18b53e..a3974185 100644
--- a/c++/nda/mpi/reduce.hpp
+++ b/c++/nda/mpi/reduce.hpp
@@ -76,7 +76,39 @@ namespace nda {
    * @{
    */
 
-  // Helper function that (all)reduces arrays/views.
+  /**
+   * @brief Implementation of an MPI reduce for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function reduces input arrays/views from all processes in the given communicator and makes the result
+   * available on the root process (`all == false`) or on all processes (`all == true`).
+   *
+   * It is expected that all input arrays/views have the same shape on all processes. The function throws an exception,
+   * if
+   * - the input array/view is not contiguous with positive strides (only for MPI compatible types),
+   * - the output array/view is not contiguous with positive strides on receiving ranks,
+   * - the output view does not have the correct shape on receiving ranks,
+   * - the data storage of the output array/view overlaps with the input array/view or
+   * - any of the MPI calls fails.
+   *
+   * The content of the output array/view depends on the MPI rank and whether it receives the data or not:
+   * - On receiving ranks, it contains the reduced data and has a shape that is the same as the shape of the input
+   * array/view.
+   * - On non-receiving ranks, the output array/view is ignored and left unchanged.
+   *
+   * Types which cannot be reduced directly, i.e. which do not have an MPI type, are reduced element-wise.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
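+   *
+   * A possible call could look like this (a minimal sketch; the `double` element type, the shapes and the explicit
+   * `MPI_SUM` are only for illustration):
+   *
+   * @code{.cpp}
+   * // input array, filled differently on each process
+   * nda::array<double, 2> A(3, 4);
+   *
+   * // output array with the correct shape on the receiving (root) process
+   * nda::array<double, 2> B(3, 4);
+   *
+   * // element-wise sum of the arrays of all processes, made available on rank 0
+   * nda::mpi_reduce_capi(A, B, mpi::communicator{}, 0, false, MPI_SUM);
+   * @endcode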
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type.
+   * @param a_in Array/view to be reduced.
+   * @param a_out Array/view to reduce into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op MPI reduction operation.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> && is_regular_or_view_v<A2>)
   void mpi_reduce_capi(A1 const &a_in, A2 &&a_out, mpi::communicator comm = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { // NOLINT
@@ -117,19 +149,7 @@ namespace nda {
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
    * initialize/assign to nda::basic_array and nda::basic_array_view objects:
    *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // reduce the array on the root process
-   * nda::array<int, 2> B = nda::lazy_mpi_reduce(A);
-   * @endcode
-   *
-   * The behavior is otherwise identical to nda::mpi_reduce and nda::mpi_reduce_in_place.
+   * The behavior is otherwise similar to nda::mpi_reduce and nda::mpi_reduce_in_place.
    *
    * The reduction is performed in-place if the target and input array/view are the same, e.g.
    *
@@ -161,22 +181,9 @@ namespace nda {
    * result available on the root process (`all == false`) or on all processes (`all == true`).
    *
    * The behavior and requirements are similar to nda::mpi_reduce, except that the function does not return a new array.
-   * Instead it writes the result directly into the given array/view on receiving ranks:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
+   * Instead it writes the result directly into the given array/view on receiving ranks.
    *
-   * // reduce the array on the root process
-   * mpi::reduce_in_place(A);
-   * @endcode
-   *
-   * Here, the array `A` will contain the result of the reduction on the root process (rank 0) and will be the same as
-   * before the MPI call on all other processes.
+   * See @ref ex6_p4 for an example.
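+   *
+   * A minimal sketch (the `int` element type and the fill with the process rank are only for illustration):
+   *
+   * @code{.cpp}
+   * // array with the same shape on all processes, filled with process-dependent data
+   * nda::array<int, 2> A(3, 4);
+   * A() = mpi::communicator{}.rank();
+   *
+   * // reduce the array (an element-wise sum by default); rank 0 receives the result,
+   * // A is left unchanged on all other ranks
+   * mpi::reduce_in_place(A);
+   * @endcode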
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be reduced.
@@ -197,30 +204,9 @@ namespace nda {
    * @details The function reduces input arrays/views from all processes in the given communicator and makes the result
    * available on the root process (`all == false`) or on all processes (`all == true`).
    *
-   * It is expected that all input arrays/views have the same shape on all processes. The function throws an exception,
-   * if
-   * - the input array/view is not contiguous with positive strides (only for MPI compatible types) or
-   * - any of the MPI calls fails.
-   *
-   * The shape of the resulting array depends on the MPI rank and whether it receives the data or not:
-   * - On receiving ranks, the shape is the same as the shape of the input array/view.
-   * - On non-receiving ranks, the shape has the rank of the input array/view with only zeros, i.e. it is empty.
-   *
-   * Types which cannot be reduced directly, i.e. which do not have an MPI type, are reduced element-wise.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // reduce the array on the root process
-   * auto B = mpi::reduce(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_reduce_capi.
    *
-   * Here, the array `B` has the shape `(3, 4)` on the root process and `(0, 0)` on all other processes.
+   * See @ref ex6_p4 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be reduced.
diff --git a/c++/nda/mpi/scatter.hpp b/c++/nda/mpi/scatter.hpp
index 66fd027c..31523d8a 100644
--- a/c++/nda/mpi/scatter.hpp
+++ b/c++/nda/mpi/scatter.hpp
@@ -61,7 +61,33 @@ namespace nda {
    * @{
    */
 
-  // Helper function that scatters arrays/views.
+  /**
+   * @brief Implementation of an MPI scatter for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function scatters a C-ordered input array/view from a root process across all processes in the given
+   * communicator. The array/view is chunked into equal parts along the first dimension using `mpi::chunk_length`.
+   *
+   * It is expected that all input arrays/views have the same rank on all processes. The function throws an exception,
+   * if
+   * - the input array/view is not contiguous with positive strides on the root process,
+   * - the output array/view is not contiguous with positive strides,
+   * - the output view does not have the correct shape or
+   * - any of the MPI calls fails.
+   *
+   * The input array/view on the root process is chunked along the first dimension into equal (as much as possible)
+   * parts using `mpi::chunk_length`. If the extent of the input array along the first dimension is not divisible by the
+   * number of processes, processes with lower ranks will receive more data than processes with higher ranks.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @param a_in Array/view to be scattered.
+   * @param a_out Array/view to scatter into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> and std::decay_t<A1>::is_stride_order_C()
              and is_regular_or_view_v<A2> and std::decay_t<A2>::is_stride_order_C())
@@ -70,7 +96,7 @@ namespace nda {
     EXPECTS_WITH_MESSAGE(detail::have_mpi_equal_ranks(a_in, comm), "Error in nda::mpi_scatter_capi: Ranks of arrays/views must be equal")
 
     // simply copy if there is no active MPI environment or if the communicator size is < 2
-    if (not mpi::has_env) {
+    if (not mpi::has_env || comm.size() < 2) {
       a_out = a_in;
       return;
     }
@@ -94,21 +120,9 @@ namespace nda {
    *
    * @details This function is lazy, i.e. it returns an mpi::lazy<mpi::tag::scatter, A> object without performing the
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
-   * initialize/assign to nda::basic_array and nda::basic_array_view objects:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(10, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // scatter the array to all processes
-   * nda::array<int, 2> B = nda::lazy_mpi_scatter(A);
-   * @endcode
+   * initialize/assign to nda::basic_array and nda::basic_array_view objects.
    *
-   * The behavior is otherwise identical to nda::mpi_scatter.
+   * The behavior is otherwise similar to nda::mpi_scatter.
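+   *
+   * For instance (a minimal sketch; the `int` element type is only for illustration):
+   *
+   * @code{.cpp}
+   * // create an array on all processes
+   * nda::array<int, 2> A(10, 4);
+   *
+   * // ...
+   * // fill array on root process
+   * // ...
+   *
+   * // scatter the array from the root process to all processes
+   * nda::array<int, 2> B = nda::lazy_mpi_scatter(A);
+   * @endcode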
    *
    * @warning MPI calls are done in the `invoke` and `shape` methods of the `mpi::lazy` object. If one rank calls one of
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
@@ -131,29 +145,9 @@ namespace nda {
    * @details The function scatters a C-ordered input array/view from a root process across all processes in the given
    * communicator. The array/view is chunked into equal parts along the first dimension using `mpi::chunk_length`.
    *
-   * It is expected that all input arrays/views have the same rank on all processes. The function throws an exception,
-   * if
-   * - the input array/view is not contiguous with positive strides on the root process or
-   * - if any of the MPI calls fails.
-   *
-   * The input array/view on the root process is chunked along the first dimension into equal (as much as possible)
-   * parts using `mpi::chunk_length`. If the extent of the input array along the first dimension is not divisible by the
-   * number of processes, processes with lower ranks will receive more data than processes with higher ranks.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(10, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // scatter the array to all processes
-   * auto B = mpi::scatter(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_scatter_capi.
    *
-   * Here, the array `B` has the shape `(10 / comm.size(), 4)` on each process (assuming that 10 is a multiple of
-   * `comm.size()`).
+   * See @ref ex6_p3 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be scattered.
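+   *
+   * A minimal sketch of a typical call (the `int` element type is only for illustration):
+   *
+   * @code{.cpp}
+   * // array created on all processes; assume it has been filled on the root process
+   * nda::array<int, 2> A(10, 4);
+   *
+   * // scatter the array to all processes
+   * auto B = mpi::scatter(A);
+   *
+   * // if 10 is a multiple of comm.size(), B has the shape (10 / comm.size(), 4) on each process
+   * @endcode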