diff --git a/c++/nda/mpi/broadcast.hpp b/c++/nda/mpi/broadcast.hpp
index 22f4cfdc..a40086bb 100644
--- a/c++/nda/mpi/broadcast.hpp
+++ b/c++/nda/mpi/broadcast.hpp
@@ -58,7 +58,7 @@ namespace nda {
    * @endcode
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be broadcasted from/into.
+   * @param a Array/view to be broadcast from/into.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    */
diff --git a/c++/nda/mpi/gather.hpp b/c++/nda/mpi/gather.hpp
index 046d3fd7..52a36443 100644
--- a/c++/nda/mpi/gather.hpp
+++ b/c++/nda/mpi/gather.hpp
@@ -194,7 +194,7 @@ namespace nda {
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type with C-layout.
-   * @param a Array or view to be gathered.
+   * @param a Array/view to be gathered.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the gather.
@@ -202,7 +202,7 @@ namespace nda {
    */
   template <typename A>
     requires(is_regular_or_view_v<A> and std::decay_t<A>::is_stride_order_C())
-  ArrayInitializer<std::remove_reference_t<A>> auto lazy_mpi_gather(A &&a, mpi::communicator comm = {}, int root = 0, bool all = false) {
+  auto lazy_mpi_gather(A &&a, mpi::communicator comm = {}, int root = 0, bool all = false) {
     return mpi::lazy<mpi::tag::gather, A>{std::forward<A>(a), comm, root, all};
   }
 
@@ -239,7 +239,7 @@ namespace nda {
    * Here, the array `B` has the shape `(3 * comm.size(), 4)` on the root process and `(0, 0)` on all other processes.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type with C-layout.
-   * @param a Array or view to be gathered.
+   * @param a Array/view to be gathered.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the gather.
diff --git a/c++/nda/mpi/reduce.hpp b/c++/nda/mpi/reduce.hpp
index 5a6dfbdc..207818a4 100644
--- a/c++/nda/mpi/reduce.hpp
+++ b/c++/nda/mpi/reduce.hpp
@@ -209,8 +209,9 @@ namespace nda {
    * nda::array<int, 2> B = nda::lazy_mpi_reduce(A);
    * @endcode
    *
-   * The behavior is otherwise identical to nda::mpi_reduce and nda::mpi_reduce_in_place. The reduction is performed
-   * in-place if the target and input array/view are the same (if the underlying data pointer is the same), e.g.
+   * The behavior is otherwise identical to nda::mpi_reduce and nda::mpi_reduce_in_place.
+   *
+   * The reduction is performed in-place if the target and input array/view are the same, e.g.
    *
    * @code{.cpp}
    * A = mpi::reduce(A);
@@ -220,7 +221,7 @@ namespace nda {
    * ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be reduced.
+   * @param a Array/view to be reduced.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
@@ -229,8 +230,7 @@ namespace nda {
    */
   template <typename A>
     requires(is_regular_or_view_v<A>)
-  ArrayInitializer<std::remove_reference_t<A>> auto lazy_mpi_reduce(A &&a, mpi::communicator comm = {}, int root = 0, bool all = false,
-                                                                    MPI_Op op = MPI_SUM) {
+  auto lazy_mpi_reduce(A &&a, mpi::communicator comm = {}, int root = 0, bool all = false, MPI_Op op = MPI_SUM) {
     return mpi::lazy<mpi::tag::reduce, A>{std::forward<A>(a), comm, root, all, op};
   }
 
@@ -259,7 +259,7 @@ namespace nda {
    * before the MPI call on all other processes.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be reduced.
+   * @param a Array/view to be reduced.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
@@ -303,7 +303,7 @@ namespace nda {
    * Here, the array `B` has the shape `(3, 4)` on the root process and `(0, 0)` on all other processes.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be reduced.
+   * @param a Array/view to be reduced.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @param all Should all processes receive the result of the reduction.
diff --git a/c++/nda/mpi/scatter.hpp b/c++/nda/mpi/scatter.hpp
index 4d7a3451..20915e5a 100644
--- a/c++/nda/mpi/scatter.hpp
+++ b/c++/nda/mpi/scatter.hpp
@@ -185,14 +185,14 @@ namespace nda {
    * these methods, all ranks in the communicator need to call the same method. Otherwise, the program will deadlock.
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be scattered.
+   * @param a Array/view to be scattered.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @return An mpi::lazy<mpi::tag::scatter, A> object modelling an nda::ArrayInitializer.
    */
   template <typename A>
     requires(is_regular_or_view_v<A> and std::decay_t<A>::is_stride_order_C())
-  ArrayInitializer<std::remove_reference_t<A>> auto lazy_mpi_scatter(A &&a, mpi::communicator comm = {}, int root = 0) {
+  auto lazy_mpi_scatter(A &&a, mpi::communicator comm = {}, int root = 0) {
     return mpi::lazy<mpi::tag::scatter, A>{std::forward<A>(a), comm, root, true};
   }
 
@@ -227,7 +227,7 @@ namespace nda {
    * `comm.size()`).
    *
    * @tparam A nda::basic_array or nda::basic_array_view type.
-   * @param a Array or view to be scattered.
+   * @param a Array/view to be scattered.
    * @param comm `mpi::communicator` object.
    * @param root Rank of the root process.
    * @return An nda::basic_array object with the result of the scattering.
diff --git a/doc/groups.dox b/doc/groups.dox
index d5de24e2..b1db31ef 100644
--- a/doc/groups.dox
+++ b/doc/groups.dox
@@ -163,11 +163,11 @@
  * The following example demonstrates some of these features:
  *
  * @code{.cpp}
+ * #include <mpi/mpi.hpp>
  * #include <nda/mpi.hpp>
  * #include <nda/nda.hpp>
  *
  * #include <iostream>
- * #include <mpi/mpi.hpp>
  *
  * int main(int argc, char **argv) {
  *   // initialize MPI environment
@@ -178,14 +178,11 @@
  *   nda::array<int, 2> A(2, 2);
  *   A() = comm.rank();
  *
- *   // reduce the array over all processes, which returns an mpi::lazy proxy object
- *   auto lazy_sum = mpi::reduce(A);
- *
- *   // since it satisfies the nda::ArrayInitializer concept, we can use it to initialize an nda::array
- *   nda::array<int, 2> sum(lazy_sum);
+ *   // reduce the array over all processes
+ *   auto A_sum = mpi::reduce(A);
  *
  *   // print the result
- *   if (comm.rank() == 0) std::cout << sum << std::endl;
+ *   if (comm.rank() == 0) std::cout << A_sum << std::endl;
  * }
  * @endcode
  *