diff --git a/c++/nda/mpi/broadcast.hpp b/c++/nda/mpi/broadcast.hpp
index a40086bb..c8436897 100644
--- a/c++/nda/mpi/broadcast.hpp
+++ b/c++/nda/mpi/broadcast.hpp
@@ -45,17 +45,7 @@ namespace nda {
    * - the array/view is not contiguous with positive strides or
    * - one of the MPI calls fails.
    *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // broadcast the array to all processes
-   * mpi::broadcast(A);
-   * @endcode
+   * See @ref ex6_p1 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be broadcasted from/into.
diff --git a/c++/nda/mpi/gather.hpp b/c++/nda/mpi/gather.hpp
index d96db74c..601dc259 100644
--- a/c++/nda/mpi/gather.hpp
+++ b/c++/nda/mpi/gather.hpp
@@ -61,7 +61,38 @@ namespace nda {
    * @{
    */
 
-  // Helper function that (all)gathers arrays/views.
+  /**
+   * @brief Implementation of an MPI gather for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function gathers C-ordered input arrays/views from all processes in the given communicator and
+   * makes the result available on the root process (`all == false`) or on all processes (`all == true`). The
+   * arrays/views are joined along the first dimension.
+   *
+   * It is expected that all input arrays/views have the same shape on all processes except for the first dimension. The
+   * function throws an exception, if
+   * - the input array/view is not contiguous with positive strides,
+   * - the output array/view is not contiguous with positive strides on receiving ranks,
+   * - the output view does not have the correct shape on receiving ranks or
+   * - any of the MPI calls fails.
+   *
+   * The input arrays/views are simply concatenated along their first dimension. The content of the output array/view
+   * depends on the MPI rank and whether it receives the data or not:
+   * - On receiving ranks, it contains the gathered data and has a shape that is the same as the shape of the input
+   * array/view except for the first dimension, which is the sum of the extents of all input arrays/views along the
+   * first dimension.
+   * - On non-receiving ranks, the output array/view is ignored and left unchanged.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @param a_in Array/view to be gathered.
+   * @param a_out Array/view to gather into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the gather.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> and std::decay_t<A1>::is_stride_order_C()
              and is_regular_or_view_v<A2> and std::decay_t<A2>::is_stride_order_C())
@@ -103,21 +134,9 @@ namespace nda {
    *
    * @details This function is lazy, i.e. it returns an mpi::lazy<mpi::tag::gather, A> object without performing the
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
-   * initialize/assign to nda::basic_array and nda::basic_array_view objects:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
+   * initialize/assign to nda::basic_array and nda::basic_array_view objects.
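+   *
+   * For instance, a minimal sketch (the `int` element type and the fill with the process rank are only for
+   * illustration):
+   *
+   * @code{.cpp}
+   * // create an array on all processes
+   * nda::array<int, 2> A(3, 4);
+   *
+   * // fill the array on each process, e.g. with the rank of the process
+   * mpi::communicator comm;
+   * A() = comm.rank();
+   *
+   * // gather the arrays on the root process
+   * nda::array<int, 2> B = nda::lazy_mpi_gather(A);
+   * @endcode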
    *
-   * // gather the arrays on the root process
-   * nda::array<int, 2> B = nda::lazy_mpi_gather(A);
-   * @endcode
-   *
-   * The behavior is otherwise identical to nda::mpi_gather.
+   * The behavior is otherwise similar to nda::mpi_gather.
    *
    * @warning MPI calls are done in the `invoke` and `shape` methods of the `mpi::lazy` object. If one rank calls one of
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
@@ -142,30 +161,9 @@ namespace nda {
    * makes the result available on the root process (`all == false`) or on all processes (`all == true`). The
    * arrays/views are joined along the first dimension.
    *
-   * It is expected that all input arrays/views have the same shape on all processes except for the first dimension. The
-   * function throws an exception, if
-   * - the input array/view is not contiguous with positive strides or
-   * - any of the MPI calls fails.
-   *
-   * The input arrays/views are simply concatenated along their first dimension. The shape of the resulting array
-   * depends on the MPI rank and whether it receives the data or not:
-   * - On receiving ranks, the shape is the same as the shape of the input array/view except for the first dimension,
-   * which is the sum of the extents of all input arrays/views along the first dimension.
-   * - On non-receiving ranks, the shape is empty, i.e. `(0,0,...,0)`.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // gather the arrays on the root process
-   * auto B = mpi::gather(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_gather_capi.
    *
-   * Here, the array `B` has the shape `(3 * comm.size(), 4)` on the root process and `(0, 0)` on all other processes.
+   * See @ref ex6_p2 for examples.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type with C-layout.
    * @param a Array/view to be gathered.
diff --git a/c++/nda/mpi/reduce.hpp b/c++/nda/mpi/reduce.hpp
index 4f18b53e..a3974185 100644
--- a/c++/nda/mpi/reduce.hpp
+++ b/c++/nda/mpi/reduce.hpp
@@ -76,7 +76,39 @@ namespace nda {
    * @{
    */
 
-  // Helper function that (all)reduces arrays/views.
+  /**
+   * @brief Implementation of an MPI reduce for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function reduces input arrays/views from all processes in the given communicator and makes the result
+   * available on the root process (`all == false`) or on all processes (`all == true`).
+   *
+   * It is expected that all input arrays/views have the same shape on all processes. The function throws an exception,
+   * if
+   * - the input array/view is not contiguous with positive strides (only for MPI compatible types),
+   * - the output array/view is not contiguous with positive strides on receiving ranks,
+   * - the output view does not have the correct shape on receiving ranks,
+   * - the data storage of the output array/view overlaps with the input array/view or
+   * - any of the MPI calls fails.
+   *
+   * The content of the output array/view depends on the MPI rank and whether it receives the data or not:
+   * - On receiving ranks, it contains the reduced data and has a shape that is the same as the shape of the input
+   * array/view.
+   * - On non-receiving ranks, the output array/view is ignored and left unchanged.
+   *
+   * Types which cannot be reduced directly, i.e. which do not have an MPI type, are reduced element-wise.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
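+   *
+   * A possible call could look like this (a minimal sketch; the `double` element type, the shapes and the explicit
+   * `MPI_SUM` are only for illustration):
+   *
+   * @code{.cpp}
+   * // input array, filled differently on each process
+   * nda::array<double, 2> A(3, 4);
+   *
+   * // output array with the correct shape on the receiving (root) process
+   * nda::array<double, 2> B(3, 4);
+   *
+   * // element-wise sum of the arrays of all processes, made available on rank 0
+   * nda::mpi_reduce_capi(A, B, mpi::communicator{}, 0, false, MPI_SUM);
+   * @endcode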
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type.
+   * @param a_in Array/view to be reduced.
+   * @param a_out Array/view to reduce into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   * @param all Should all processes receive the result of the reduction.
+   * @param op MPI reduction operation.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> && is_regular_or_view_v<A2>)
   void mpi_reduce_capi(A1 const &a_in, A2 &&a_out, mpi::communicator comm = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) { // NOLINT
@@ -117,19 +149,7 @@ namespace nda {
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
    * initialize/assign to nda::basic_array and nda::basic_array_view objects:
    *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // reduce the array on the root process
-   * nda::array<int, 2> B = nda::lazy_mpi_reduce(A);
-   * @endcode
-   *
-   * The behavior is otherwise identical to nda::mpi_reduce and nda::mpi_reduce_in_place.
+   * The behavior is otherwise similar to nda::mpi_reduce and nda::mpi_reduce_in_place.
    *
    * The reduction is performed in-place if the target and input array/view are the same, e.g.
    *
@@ -161,22 +181,9 @@ namespace nda {
    * result available on the root process (`all == false`) or on all processes (`all == true`).
    *
    * The behavior and requirements are similar to nda::mpi_reduce, except that the function does not return a new array.
-   * Instead it writes the result directly into the given array/view on receiving ranks:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
+   * Instead it writes the result directly into the given array/view on receiving ranks.
    *
-   * // reduce the array on the root process
-   * mpi::reduce_in_place(A);
-   * @endcode
-   *
-   * Here, the array `A` will contain the result of the reduction on the root process (rank 0) and will be the same as
-   * before the MPI call on all other processes.
+   * See @ref ex6_p4 for an example.
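+   *
+   * A minimal sketch (the `int` element type and the fill with the process rank are only for illustration):
+   *
+   * @code{.cpp}
+   * // array with the same shape on all processes, filled with process-dependent data
+   * nda::array<int, 2> A(3, 4);
+   * A() = mpi::communicator{}.rank();
+   *
+   * // reduce the array (an element-wise sum by default); rank 0 receives the result,
+   * // A is left unchanged on all other ranks
+   * mpi::reduce_in_place(A);
+   * @endcode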
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be reduced.
@@ -197,30 +204,9 @@ namespace nda {
    * @details The function reduces input arrays/views from all processes in the given communicator and makes the result
    * available on the root process (`all == false`) or on all processes (`all == true`).
    *
-   * It is expected that all input arrays/views have the same shape on all processes. The function throws an exception,
-   * if
-   * - the input array/view is not contiguous with positive strides (only for MPI compatible types) or
-   * - any of the MPI calls fails.
-   *
-   * The shape of the resulting array depends on the MPI rank and whether it receives the data or not:
-   * - On receiving ranks, the shape is the same as the shape of the input array/view.
-   * - On non-receiving ranks, the shape has the rank of the input array/view with only zeros, i.e. it is empty.
-   *
-   * Types which cannot be reduced directly, i.e. which do not have an MPI type, are reduced element-wise.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(3, 4);
-   *
-   * // ...
-   * // fill array on each process
-   * // ...
-   *
-   * // reduce the array on the root process
-   * auto B = mpi::reduce(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_reduce_capi.
    *
-   * Here, the array `B` has the shape `(3, 4)` on the root process and `(0, 0)` on all other processes.
+   * See @ref ex6_p4 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be reduced.
diff --git a/c++/nda/mpi/scatter.hpp b/c++/nda/mpi/scatter.hpp
index 66fd027c..31523d8a 100644
--- a/c++/nda/mpi/scatter.hpp
+++ b/c++/nda/mpi/scatter.hpp
@@ -61,7 +61,33 @@ namespace nda {
    * @{
    */
 
-  // Helper function that scatters arrays/views.
+  /**
+   * @brief Implementation of an MPI scatter for nda::basic_array or nda::basic_array_view types using a C-style API.
+   *
+   * @details The function scatters a C-ordered input array/view from a root process across all processes in the given
+   * communicator. The array/view is chunked into equal parts along the first dimension using `mpi::chunk_length`.
+   *
+   * It is expected that all input arrays/views have the same rank on all processes. The function throws an exception,
+   * if
+   * - the input array/view is not contiguous with positive strides on the root process,
+   * - the output array/view is not contiguous with positive strides,
+   * - the output view does not have the correct shape or
+   * - any of the MPI calls fails.
+   *
+   * The input array/view on the root process is chunked along the first dimension into equal (as much as possible)
+   * parts using `mpi::chunk_length`. If the extent of the input array along the first dimension is not divisible by the
+   * number of processes, processes with lower ranks will receive more data than processes with higher ranks.
+   *
+   * If `mpi::has_env` is false or if the communicator size is < 2, it simply copies the input array/view to the output
+   * array/view.
+   *
+   * @tparam A1 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @tparam A2 nda::basic_array or nda::basic_array_view type with C-layout.
+   * @param a_in Array/view to be scattered.
+   * @param a_out Array/view to scatter into.
+   * @param comm `mpi::communicator` object.
+   * @param root Rank of the root process.
+   */
   template <typename A1, typename A2>
     requires(is_regular_or_view_v<A1> and std::decay_t<A1>::is_stride_order_C()
              and is_regular_or_view_v<A2> and std::decay_t<A2>::is_stride_order_C())
@@ -70,7 +96,7 @@ namespace nda {
     EXPECTS_WITH_MESSAGE(detail::have_mpi_equal_ranks(a_in, comm), "Error in nda::mpi_scatter_capi: Ranks of arrays/views must be equal")
 
     // simply copy if there is no active MPI environment or if the communicator size is < 2
-    if (not mpi::has_env) {
+    if (not mpi::has_env || comm.size() < 2) {
       a_out = a_in;
       return;
     }
@@ -94,21 +120,9 @@ namespace nda {
    *
    * @details This function is lazy, i.e. it returns an mpi::lazy<mpi::tag::scatter, A> object without performing the
    * actual MPI operation. Since the returned object models an nda::ArrayInitializer, it can be used to
-   * initialize/assign to nda::basic_array and nda::basic_array_view objects:
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(10, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // scatter the array to all processes
-   * nda::array<int, 2> B = nda::lazy_mpi_scatter(A);
-   * @endcode
+   * initialize/assign to nda::basic_array and nda::basic_array_view objects.
    *
-   * The behavior is otherwise identical to nda::mpi_scatter.
+   * The behavior is otherwise similar to nda::mpi_scatter.
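+   *
+   * For instance (a minimal sketch; the `int` element type is only for illustration):
+   *
+   * @code{.cpp}
+   * // create an array on all processes
+   * nda::array<int, 2> A(10, 4);
+   *
+   * // ...
+   * // fill array on root process
+   * // ...
+   *
+   * // scatter the array from the root process to all processes
+   * nda::array<int, 2> B = nda::lazy_mpi_scatter(A);
+   * @endcode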
    *
    * @warning MPI calls are done in the `invoke` and `shape` methods of the `mpi::lazy` object. If one rank calls one of
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
@@ -131,29 +145,9 @@ namespace nda {
    * @details The function scatters a C-ordered input array/view from a root process across all processes in the given
    * communicator. The array/view is chunked into equal parts along the first dimension using `mpi::chunk_length`.
    *
-   * It is expected that all input arrays/views have the same rank on all processes. The function throws an exception,
-   * if
-   * - the input array/view is not contiguous with positive strides on the root process or
-   * - if any of the MPI calls fails.
-   *
-   * The input array/view on the root process is chunked along the first dimension into equal (as much as possible)
-   * parts using `mpi::chunk_length`. If the extent of the input array along the first dimension is not divisible by the
-   * number of processes, processes with lower ranks will receive more data than processes with higher ranks.
-   *
-   * @code{.cpp}
-   * // create an array on all processes
-   * nda::array<int, 2> A(10, 4);
-   *
-   * // ...
-   * // fill array on root process
-   * // ...
-   *
-   * // scatter the array to all processes
-   * auto B = mpi::scatter(A);
-   * @endcode
+   * It simply constructs an empty array and then calls nda::mpi_scatter_capi.
    *
-   * Here, the array `B` has the shape `(10 / comm.size(), 4)` on each process (assuming that 10 is a multiple of
-   * `comm.size()`).
+   * See @ref ex6_p3 for an example.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
    * @param a Array/view to be scattered.
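+   *
+   * A minimal sketch of a typical call (the `int` element type is only for illustration):
+   *
+   * @code{.cpp}
+   * // array created on all processes; assume it has been filled on the root process
+   * nda::array<int, 2> A(10, 4);
+   *
+   * // scatter the array to all processes
+   * auto B = mpi::scatter(A);
+   *
+   * // if 10 is a multiple of comm.size(), B has the shape (10 / comm.size(), 4) on each process
+   * @endcode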