From 75a089aaed7e7119ce7a50a456cca1c9755e435f Mon Sep 17 00:00:00 2001 From: Thomas Hahn Date: Thu, 18 Jul 2024 09:36:59 -0400 Subject: [PATCH] Add examples to docs --- c++/nda/lapack/getrf.hpp | 2 +- c++/nda/lapack/getrs.hpp | 4 +- doc/DoxygenLayout.xml | 14 +- doc/ex1.md | 574 +++++++++++++++++++++++++++++++++++++++ doc/ex2.md | 268 ++++++++++++++++++ doc/ex3.md | 266 ++++++++++++++++++ doc/ex4.md | 473 ++++++++++++++++++++++++++++++++ doc/ex5.md | 404 +++++++++++++++++++++++++++ doc/ex6.md | 440 ++++++++++++++++++++++++++++++ doc/ex7.md | 314 +++++++++++++++++++++ doc/ex8.md | 411 ++++++++++++++++++++++++++++ doc/examples.md | 24 +- doc/groups.dox | 17 +- 13 files changed, 3189 insertions(+), 22 deletions(-) create mode 100644 doc/ex1.md create mode 100644 doc/ex2.md create mode 100644 doc/ex3.md create mode 100644 doc/ex4.md create mode 100644 doc/ex5.md create mode 100644 doc/ex6.md create mode 100644 doc/ex7.md create mode 100644 doc/ex8.md diff --git a/c++/nda/lapack/getrf.hpp b/c++/nda/lapack/getrf.hpp index 32cfc24ae..b2d6c1b67 100644 --- a/c++/nda/lapack/getrf.hpp +++ b/c++/nda/lapack/getrf.hpp @@ -53,7 +53,7 @@ namespace nda::lapack { * @param a Input/output matrix. On entry, the m-by-n matrix to be factored. On exit, the factors \f$ \mathbf{L} \f$ * and \f$ \mathbf{U} \f$ from the factorization \f$ \mathbf{A} = \mathbf{P L U} \f$; the unit diagonal elements of * \f$ \mathbf{L} \f$ are not stored. - * @param ipiv Output vector. The pivot indices from `getrf`, i.e. for `1 <= i <= N`, row i of the matrix was + * @param ipiv Output vector. The pivot indices from `getrf`, i.e. for `1 <= i <= n`, row i of the matrix was * interchanged with row `ipiv(i)`. * @return Integer return code from the LAPACK call. */ diff --git a/c++/nda/lapack/getrs.hpp b/c++/nda/lapack/getrs.hpp index 626d23f9b..dfb62fe2d 100644 --- a/c++/nda/lapack/getrs.hpp +++ b/c++/nda/lapack/getrs.hpp @@ -50,12 +50,12 @@ namespace nda::lapack { * * @tparam A nda::MemoryMatrix type. 
* @tparam B nda::MemoryMatrix type. - * @tparam B nda::MemoryVector type. + * @tparam IPIV nda::MemoryVector type. * @param a Input matrix. The factors \f$ \mathbf{L} \f$ and \f$ \mathbf{U} \f$ from the factorization \f$ \mathbf{A} * = \mathbf{P L U} \f$ as computed by `getrf`. * @param b Input/output matrix. On entry, the right hand side matrix \f$ \mathbf{B} \f$. On exit, the solution matrix * \f$ \mathbf{X} \f$. - * @param ipiv Input vector. The pivot indices from `getrf`, i.e. for `1 <= i <= N`, row i of the matrix was + * @param ipiv Input vector. The pivot indices from `getrf`, i.e. for `1 <= i <= n`, row i of the matrix was * interchanged with row `ipiv(i)`. * @return Integer return code from the LAPACK call. */ diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index 6831b411a..bc28277a8 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -17,6 +17,14 @@ + + + + + + + + @@ -131,15 +139,10 @@ - - - - - @@ -149,7 +152,6 @@ - diff --git a/doc/ex1.md b/doc/ex1.md new file mode 100644 index 000000000..68e4f18cd --- /dev/null +++ b/doc/ex1.md @@ -0,0 +1,574 @@ +@page ex1 Example 1: A quick overview + +[TOC] + +In this example, we give a quick overview of the basic capabilities of **nda**. + +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include +#include
+#include + +int main(int argc, char *argv[]) { + // code snippets go here ... +} +``` + +@subsection ex1_p1 Creating and initializing an array + +In its simplest form, an nda::array has 2 template parameters + +- its value type and +- its rank or number of dimensions. + +The following creates an integer (`int`) array of rank 2 and initializes its elements: + +```cpp +// create a 3x2 integer array and initialize it +auto A = nda::array(3, 2); +for (int i = 0; auto &x : A) x = i++; +``` + +The first line calls the constructor with the intended shape of the array, i.e. the extents along the two dimensions. +In this case, we want it to be 3-by-2. + +In the second line, we use a range-based for loop (note the reference `auto &`) to assign the first few positive integer +numbers to the elements of the array. + +We could have achieved the same using the constructor which takes `std::initializer_list` objects: + +```cpp +// create a 3x2 integer array using initializer lists +auto A2 = nda::array{{0, 1}, {2, 3}, {4, 5}}; +``` + +@subsection ex1_p2 Choosing a memory layout + +By default, nda::array stores its elements in C-order. To create an array in Fortran-order, we can specify a third +template parameter: + +```cpp +// create a 3x2 integer array in Fortran order and initialize it +auto B = nda::array(3, 2); +for (int i = 0; auto &x : B) x = i++; +``` + +Here, nda::F_layout is one of the @ref layout_pols. + +While in 2-dimensions, the only possibilities are C-order or Fortran-order, in higher dimensions one can also specify +other stride orders (see nda::basic_layout and nda::basic_layout_str). 
+ +@subsection ex1_p3 Printing an array + +Let's check the contents, sizes and shapes of the arrays using the overloaded streaming operators: + +```cpp +// print the formatted arrays, its sizes and its shapes +std::cout << "A = " << A << std::endl; +std::cout << "A.size() = " << A.size() << std::endl; +std::cout << "A.shape() = " << A.shape() << std::endl; +std::cout << std::endl; +std::cout << "B = " << B << std::endl; +std::cout << "B.size() = " << B.size() << std::endl; +std::cout << "B.shape() = " << B.shape() << std::endl; +``` + +Output: + +``` +A = +[[0,1] + [2,3] + [4,5]] +A.size() = 6 +A.shape() = (3 2) +B = +[[0,3] + [1,4] + [2,5]] +B.size() = 6 +B.shape() = (3 2) +``` +You can see the difference between the memory layouts of the array: + +- nda::C_layout iterates first over the second dimension while +- nda::F_layout iterates first over the first dimension. + +> **Note**: The reason why we see a difference in the two arrays is because the range-based for loop is optimized to +> iterate over contiguous memory whenever possible. If we would have used traditional for loops instead, +> ```cpp +> int n = 0; +> for (int i = 0; i < 3; ++i) { +> for (int j = 0; j < 2; ++j) { +> B(i, j) = n++; +> } +> } +> ``` +> there would be no difference between the output of `A` and `B`: +> ``` +> B = +> [[0,1] +> [2,3] +> [4,5]] +> ``` + +@subsection ex1_p4 Accessing single elements + +We can access single elements of the array using the function call operator of the array object. +For a 2-dimensional array, we have to pass exactly two indices, otherwise it won't compile: + +```cpp +// access and assign to a single element +A(2, 1) = 100; +B(2, 1) = A(2, 1); +std::cout << "A = " << A << std::endl; +std::cout << "B = " << B << std::endl; +std::cout << "B(2, 1) = " << B(2, 1) << std::endl; +``` + +Output: + +``` +A = +[[0,1] + [2,3] + [4,100]] +B = +[[0,3] + [1,4] + [2,100]] +B(2, 1) = 100 +``` + +> **Note**: Out-of-bounds checks are only enabled in debug mode. 
The following code will compile and result in undefined +> behavior: +> ```cpp +> std::cout << "A(3, 2) = " << A(3, 2) << std::endl; +> ``` +> Output (might be different on other machines): +> ``` +> A(3, 2) = 9 +> ``` + +@subsection ex1_p5 Assigning to an array + +It is straightforward to assign a scalar or another array to an existing array: + +```cpp +// assigning a scalar to an array +A = 42; +std::cout << "A = " << A << std::endl; + +// assigning an array to another array +A = B; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[42,42] + [42,42] + [42,42]] +A = +[[0,3] + [1,4] + [2,100]] +``` + +@subsection ex1_p6 Working with views + +Views offer a lightweight and efficient way to manipulate and operate on existing arrays since they do not own their +data, i.e. there is no memory allocation or copying involved when creating a view (see nda::basic_array_view). + +Taking a view on an existing array can be done with an empty function call: + +```cpp +// create a view on A +auto A_v = A(); +std::cout << "A_v =" << A_v << std::endl; +std::cout << "A_v.size() = " << A_v.size() << std::endl; +std::cout << "A_v.shape() = " << A_v.shape() << std::endl; +``` + +Output: + +``` +A_v = +[[0,3] + [1,4] + [2,100]] +A_v.size() = 6 +A_v.shape() = (3 2) +``` + +Here, `A_v` has the same data, size and shape as `A`. +Since `A_v` is a view, it points to the same memory location as `A`. +That means if we change something in `A_v`, we will also see the changes in `A`: + +```cpp +// manipulate the data in the view +A_v(0, 1) = -12; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[0,-12] + [1,4] + [2,100]] +``` + +In most cases, views will just behave like arrays and the majority of functions and operations that can be performed +with arrays, also work with views. + +@subsection ex1_p7 Working with slices + +A slice is a view on only some parts of an existing array. 
+ +To take a slice of an array, one uses again the function call operator but this time with one or more `nda::range`, +`nda::range::all` or nda::ellipsis objects: + +- `nda::range` is similar to a Python [range](https://docs.python.org/3/library/stdtypes.html#range) and specifies which +elements should be included in the slice along a certain dimension. +- `nda::range::all` is similar to Python's `:` syntax and specifies that all elements along a certain dimension should +be included in the slice. +- nda::ellipsis is similar to Python's `...` syntax and specifies that all "missing" dimensions should be included in +the slice. One can achieve the same with multiple `nda::range::all` objects. + +For example, we can view only the second row of `A`: + +```cpp +// create a slice on A (a view on its second row) +auto A_s = A(1, nda::range::all); +std::cout << "A_s = " << A_s << std::endl; +std::cout << "A_s.size() = " << A_s.size() << std::endl; +std::cout << "A_s.shape() = " << A_s.shape() << std::endl; +``` + +Output: + +``` +A_s = [1,4] +A_s.size() = 2 +A_s.shape() = (2) +``` + +As seen in this example, slices usually have a different size and shape compared to the original array. +In this case, it is a 1-dimensional view of size 2. + +Since a slice is just a view, everything mentioned above about views still applies: + +```cpp +// assign to a slice +A_s = nda::array{-1, -4}; +std::cout << "A_s = " << A_s << std::endl; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A_s = [-1,-4] +A = +[[0,-12] + [-1,-4] + [2,100]] +``` + +@subsection ex1_p8 Performing arithmetic operations + +We can perform various @ref av_ops with arrays and views. +Arithmetic operations are (mostly) implemented as lazy expressions. That means the operations are not performed right +away but only when needed at a later time. 
+ +For example, the following tries to add two 2-dimensional arrays element-wise: + +```cpp +// add two arrays element-wise +auto C = nda::array({{1, 2}, {3, 4}}); +auto D = nda::array({{5, 6}, {7, 8}}); +std::cout << C + D << std::endl; +``` + +Output: + +``` +( +[[1,2] + [3,4]] + +[[5,6] + [7,8]]) +``` + +Note that `C + D` does not return the result of the addition in the form of a new array object, but instead it returns +an nda::expr object that describes the operation it is about to do. + +To turn a lazy expression into an array/view, we can either assign it to an existing array/view, construct a new array +or use nda::make_regular: + +```cpp +// evaluate the lazy expression by assigning it to an existing array +auto E = nda::array(2, 2); +E = C + D; +std::cout << "E = " << E << std::endl; + +// evaluate the lazy expression by constructing a new array +std::cout << "nda::array{C + D} = " << nda::array{C + D} << std::endl; + +// evaluate the lazy expression using nda::make_regular +std::cout << "nda::make_regular(C + D) = " << nda::make_regular(C + D) << std::endl; +``` + +Output: + +``` +E = +[[6,8] + [10,12]] +nda::array{C + D} = +[[6,8] + [10,12]] +nda::make_regular(C + D) = +[[6,8] + [10,12]] +``` + +The behavior of arithmetic operations (and some other functions) depends on the algebra (see nda::get_algebra) of its +operands. + +For example, consider matrix-matrix multiplication (``'M'`` algebra) vs. 
element-wise multiplication of two arrays +(``'A'`` algebra): + +```cpp +// multiply two arrays elementwise +std::cout << "C * D =" << nda::array(C * D) << std::endl; + +// multiply two matrices +auto M1 = nda::matrix({{1, 2}, {3, 4}}); +auto M2 = nda::matrix({{5, 6}, {7, 8}}); +std::cout << "M1 * M2 =" << nda::matrix(M1 * M2) << std::endl; +``` + +Output: + +``` +C * D = +[[5,12] + [21,32]] +M1 * M2 = +[[19,22] + [43,50]] +``` + +Here, an nda::matrix is the same as an nda::array of rank 2, except that it belongs to the ``'M'`` algebra instead of +the ``'A'`` algebra. + +@subsection ex1_p9 Applying mathematical functions and algorithms + +Similar to arithmetic operations, most of the @ref av_math that can operate on arrays and views return a lazy function +call expression (nda::expr_call) and are sensitive to their algebras: + +```cpp +// element-wise square +nda::array C_sq = nda::pow(C, 2); +std::cout << "C^2 =" << C_sq << std::endl; + +// element-wise square root +nda::array C_sq_sqrt = nda::sqrt(C_sq); +std::cout << "sqrt(C^2) =" << C_sq_sqrt << std::endl; + +// trace of a matrix +std::cout << "trace(M1) = " << nda::trace(M1) << std::endl; +``` + +Output: + +``` +C^2 = +[[1,4] + [9,16]] +sqrt(C^2) = +[[1,2] + [3,4]] +trace(M1) = 5 +``` + +Besides arithmetic operations and mathematical functions, one can apply some basic @ref av_algs to arrays and views: + +```cpp +// find the minimum/maximum element in an array +std::cout << "min_element(C) =" << nda::min_element(C) << std::endl; +std::cout << "max_element(C) =" << nda::max_element(C) << std::endl; + +// is any (are all) element(s) greater than 1? 
+auto greater1 = nda::map([](int x) { return x > 1; })(C); +std::cout << "any(C > 1) = " << nda::any(greater1) << std::endl; +std::cout << "all(C > 1) = " << nda::all(greater1) << std::endl; + +// sum/multiply all elements in an array +std::cout << "sum(C) = " << nda::sum(C) << std::endl; +std::cout << "product(C) = " << nda::product(C) << std::endl; +``` + +Output: + +``` +min_element(C) = 1 +max_element(C) = 4 +any(C > 1) = 1 +all(C > 1) = 0 +sum(C) = 10 +product(C) = 24 +``` + +Most of the algorithms in the above example are self-explanatory, except for the nda::any and nda::all calls. +They expect the given array elements to have a `bool` value type which is not the case for the `C` array. +We therefore use nda::map to create a lazy nda::expr_call that returns a `bool` upon calling it. +Since nda::expr_call fulfills the nda::Array concept, it can be passed to the algorithms. + +@subsection ex1_p10 Writing/Reading HDF5 + +Writing arrays and views to HDF5 files using **nda's** @ref av_hdf5 is as easy as + +```cpp +// write an array to an HDF5 file +h5::file out_file("ex1.h5", 'w'); +h5::write(out_file, "C", C); +``` + +Dumping the resulting HDF5 file gives + +``` +HDF5 "ex1.h5" { +GROUP "/" { + DATASET "C" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 2, 2 ) / ( 2, 2 ) } + DATA { + (0,0): 1, 2, + (1,0): 3, 4 + } + } +} +} +``` + +Reading the same file into a new array is just as simple: + +```cpp +// read an array from an HDF5 file +nda::array C_copy; +h5::file in_file("ex1.h5", 'r'); +h5::read(in_file, "C", C_copy); +std::cout << "C_copy = " << C_copy << std::endl; +``` + +Output: + +``` +C_copy = +[[1,2] + [3,4]] +``` + +@subsection ex1_p11 Doing linear algebra with arrays + +**nda** has a @ref linalg_blas and an @ref linalg_lapack and provides the nda::matrix and nda::vector types. +While nda::matrix is a 2-dimensional array belonging to the ``'M'`` algebra, nda::vector is a 1-dimensional array +belonging to the ``'V'`` algebra. 
+Under the hood, both of them are just type aliases of the general nda::basic_array class. +Everything mentioned above for nda::array objects is therefore also true for matrices and vectors, e.g. writing/reading +to HDF5, accessing single elements, taking slices, etc. + +Let us demonstrate some of the supported @ref linalg features: + +```cpp +// create a 2x2 matrix +auto M3 = nda::matrix{{1, 2}, {3, 4}}; + +// get the inverse of the matrix (calls LAPACK routines) +auto M3_inv = nda::inverse(M3); +std::cout << "M3_inv = " << M3_inv << std::endl; + +// get the inverse of a matrix manually using its adjugate and determinant +auto M3_adj = nda::matrix{{4, -2}, {-3, 1}}; +auto M3_det = nda::determinant(M3); +auto M3_inv2 = nda::matrix(M3_adj / M3_det); +std::cout << "M3_inv2 = " << M3_inv2 << std::endl; + +// check the inverse using matrix-matrix multiplication +std::cout << "M3 * M3_inv = " << M3 * M3_inv << std::endl; +std::cout << "M3_inv * M3 = " << M3_inv * M3 << std::endl; + +// solve a linear system using the inverse +auto b = nda::vector{5, 8}; +auto x = M3_inv * b; +std::cout << "M3 * x = " << M3 * x << std::endl; +``` + +Output: + +``` +M3_inv = +[[-2,1] + [1.5,-0.5]] +M3_inv2 = +[[-2,1] + [1.5,-0.5]] +M3 * M3_inv = +[[1,0] + [0,1]] +M3_inv * M3 = +[[1,0] + [0,1]] +M3 * x = [5,8] +``` + +> **Note**: Matrix-matrix and matrix-vector multiplication do not return lazy expressions, since they call the +> corresponding BLAS routines directly, while element-wise array-array multiplication does return a lazy expression. + +@subsection ex1_p12 Initializing with CLEF's automatic assignment + +**nda** contains the @ref clef (CLEF) library which is a more or less standalone implementation of general lazy +expressions. 
+ +One of the most useful tools of CLEF is its @ref clef_autoassign feature: + +```cpp +// initialize a 4x7 array using CLEF automatic assignment +using namespace nda::clef::literals; +auto F = nda::array(4, 7); +F(i_, j_) << i_ * 7 + j_; +std::cout << "F = " << F << std::endl; +``` + +Output: + +``` +F = +[[0,1,2,3,4,5,6] + [7,8,9,10,11,12,13] + [14,15,16,17,18,19,20] + [21,22,23,24,25,26,27]] +``` + +Here, we first construct an uninitialized 4-by-7 array before we use automatic assignment to initialize the array. +This is done with the overloaded `operator<<` and nda::clef::placeholder objects (`i_` and `j_` in the above example). +It will simply loop over all possible multidimensional indices of the array, substitute them for the placeholders and +assign the result to the corresponding array element, e.g. `F(3, 4) = 3 * 7 + 4 = 25`. + +This is especially helpful for high-dimensional arrays where the element at `(i, j, ..., k)` can be written as some +function \f$ g \f$ of its indices, i.e. \f$ F_{ij \dots k} = g(i, j, \dots, k) \f$. + +@subsection ex1_p13 Further examples + +The above features constitute only a fraction of what you can do with **nda**. + +For more details, please look at the other @ref examples and the @ref documentation. diff --git a/doc/ex2.md b/doc/ex2.md new file mode 100644 index 000000000..9bd7a0ed8 --- /dev/null +++ b/doc/ex2.md @@ -0,0 +1,268 @@ +@page ex2 Example 2: Constructing arrays + +[TOC] + +In this example, we show how to construct arrays in different ways. + +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include +#include + +int main(int argc, char *argv[]) { + // code snippets go here ... 
+}
+```
+
+@subsection ex2_p1 Default constructor
+
+The default constructor creates an empty array of size 0:
+
+```cpp
+// default constructor
+auto A = nda::array();
+std::cout << "A = " << A << std::endl;
+std::cout << "A.size() = " << A.size() << std::endl;
+std::cout << "A.shape() = " << A.shape() << std::endl;
+```
+
+Output:
+
+```
+A =
+[]
+A.size() = 0
+A.shape() = (0 0)
+```
+
+Since the size of the array is zero, no memory has been allocated.
+To be able to use the array in a productive way, one has to resize it.
+This can be done either
+- directly via the free nda::resize_or_check_if_view function,
+- the nda::basic_array::resize member or
+- by assigning another array/view to it which indirectly will call nda::basic_array::resize:
+
+```cpp
+// resize the array using the resize method
+A.resize(3, 2);
+std::cout << "A.size() = " << A.size() << std::endl;
+std::cout << "A.shape() = " << A.shape() << std::endl;
+
+// resize using assignment
+A = nda::array(10, 10);
+std::cout << "A.size() = " << A.size() << std::endl;
+std::cout << "A.shape() = " << A.shape() << std::endl;
+```
+
+Output:
+
+```
+A.size() = 6
+A.shape() = (3 2)
+A.size() = 100
+A.shape() = (10 10)
+```
+
+@subsection ex2_p2 Constructing an array with a given shape
+
+The usual way to create an array is by specifying its shape. 
+While the shape is a runtime parameter, the rank of the array still has to be known at compile-time (it is a template +parameter): + +```cpp +// create a 100x100 complex matrix +auto M1 = nda::matrix>(100, 100); +std::cout << "M1.size() = " << M1.size() << std::endl; +std::cout << "M1.shape() = " << M1.shape() << std::endl; +``` + +Output: + +``` +M1.size() = 10000 +M1.shape() = (100 100) +``` + +While for higher dimensional arrays the elements are in general left uninitialized, it is possible to construct an +1-dimensional array with a given size and initialize its elements to a constant value: + +```cpp +// create a 1-dimensional vector of size 5 with the value 10 + 1i +using namespace std::complex_literals; +auto v1 = nda::vector>(5, 10 + 1i); +std::cout << "v1 = " << v1 << std::endl; +std::cout << "v1.size() = " << v1.size() << std::endl; +std::cout << "v1.shape() = " << v1.shape() << std::endl; +``` + +Output: + +``` +v1 = [(10,1),(10,1),(10,1),(10,1),(10,1)] +v1.size() = 5 +v1.shape() = (5) +``` + +@subsection ex2_p3 Copy/Move constructors + +The copy and move constructors behave as expected: + +- the copy constructor simply copies the memory handle and layout of an array and +- the move constructor moves them: + +```cpp +// copy constructor +auto v2 = v1; +std::cout << "v2 = " << v2 << std::endl; +std::cout << "v2.size() = " << v2.size() << std::endl; +std::cout << "v2.shape() = " << v2.shape() << std::endl; + +// move constructor +auto v3 = std::move(v2); +std::cout << "v3 = " << v3 << std::endl; +std::cout << "v3.size() = " << v3.size() << std::endl; +std::cout << "v3.shape() = " << v3.shape() << std::endl; +std::cout << "v2.empty() = " << v2.empty() << std::endl; +``` + +Output: + +``` +v2 = [(10,1),(10,1),(10,1),(10,1),(10,1)] +v2.size() = 5 +v2.shape() = (5) +v3 = [(10,1),(10,1),(10,1),(10,1),(10,1)] +v3.size() = 5 +v3.shape() = (5) +v2.empty() = 1 +``` + +> **Note**: After moving, the vector `v2` is empty, i.e. 
its memory handle does not manage any memory at the moment. To +> use it, one should again resize it, so that new memory is allocated. + +@subsection ex2_p4 Constructing an array from its data + +1-, 2- and 3-dimensional arrays can be constructed directly from their data using `std::initializer_list` objects: +- a 1-dimensional array is constructed from a single list, +- a 2-dimensional array is constructed from a nested list (a list of lists) and +- a 3-dimensional array is constructed from a double nested list (a list of lists of lists). + +```cpp +// 1-dimensional array from a std::initializer_list +auto A1 = nda::array{1, 2, 3, 4, 5}; +std::cout << "A1 = " << A1 << std::endl; +std::cout << "A1.size() = " << A1.size() << std::endl; +std::cout << "A1.shape() = " << A1.shape() << std::endl; + +// 2-dimensional array from a std::initializer_list +auto A2 = nda::array{{1, 2}, {3, 4}, {5, 6}}; +std::cout << "A2 = " << A2 << std::endl; +std::cout << "A2.size() = " << A2.size() << std::endl; +std::cout << "A2.shape() = " << A2.shape() << std::endl; + +// 3-dimensional array from a std::initializer_list +auto A3 = nda::array{{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}; +std::cout << "A3 = " << A3 << std::endl; +std::cout << "A3.size() = " << A3.size() << std::endl; +std::cout << "A3.shape() = " << A3.shape() << std::endl; +``` + +Output: + +``` +A1 = [1,2,3,4,5] +A1.size() = 5 +A1.shape() = (5) +A2 = +[[1,2] + [3,4] + [5,6]] +A2.size() = 6 +A2.shape() = (3 2) +A3 = [1,2,3,4,5,6,7,8,9,10,11,12] +A3.size() = 12 +A3.shape() = (2 3 2) +``` + +@subsection ex2_p5 Constructing an array from an nda::Array + +We can also construct an array from any object that satisfies the nda::Array concept, has a compatible value type and +the same rank. + +For example, this could be a lazy expression or another array/view with a possibly different +- value type, +- memory layout (i.e. stride order), +- algebra and/or +- container policy (e.g. 
construct an array on the GPU from an array on the host). + +```cpp +// construct an array from a lazy expression +nda::array A1_sum = A1 + A1; +std::cout << "A1_sum = " << A1_sum << std::endl; +std::cout << "A1_sum.size() = " << A1_sum.size() << std::endl; +std::cout << "A1_sum.shape() = " << A1_sum.shape() << std::endl; + +// construct an array from another array with a different memory layout +nda::array A2_f(A2); +std::cout << "A2_f = " << A2_f << std::endl; +std::cout << "A2_f.size() = " << A2_f.size() << std::endl; +std::cout << "A2_f.shape() = " << A2_f.shape() << std::endl; +``` + +Output: + +``` +A1_sum = [2,4,6,8,10] +A1_sum.size() = 5 +A1_sum.shape() = (5) +A2_f = +[[1,2] + [3,4] + [5,6]] +A2_f.size() = 6 +A2_f.shape() = (3 2) +``` + +@subsection ex2_p6 Factories and transformations + +**nda** provides various factory functions and transformations that allow us to construct special arrays very easily +either from scratch or from some other input arrays. + +Here, we only give a few examples and refer the interested reader to the @ref av_factories documentation. 
+ +```cpp +// 1-dimensional array with only even integers from 2 to 20 +auto v4 = nda::arange(2, 20, 2); +std::cout << "v4 = " << v4 << std::endl; + +// 3x3 identity matrix +auto I = nda::eye(3); +std::cout << "I = " << I << std::endl; + +// 2x2x2 array with random numbers from 0 to 1 +auto R = nda::rand(2, 2, 2); +std::cout << "R = " << R << std::endl; +std::cout << "R.shape() = " << R.shape() << std::endl; + +// 3x6 array with zeros +auto Z = nda::zeros(3, 6); +std::cout << "Z = " << Z << std::endl; +``` + +Output: + +``` +v4 = [2,4,6,8,10,12,14,16,18] +I = +[[1,0,0] + [0,1,0] + [0,0,1]] +R = [0.349744,0.226507,0.444232,0.229969,0.370864,0.440588,0.607665,0.754914] +R.shape() = (2 2 2) +Z = +[[0,0,0,0,0,0] + [0,0,0,0,0,0] + [0,0,0,0,0,0]] +``` diff --git a/doc/ex3.md b/doc/ex3.md new file mode 100644 index 000000000..06b4c4f8a --- /dev/null +++ b/doc/ex3.md @@ -0,0 +1,266 @@ +@page ex3 Example 3: Initializing arrays + +[TOC] + +In this example, we show how to initialize arrays and assign to them in different ways. + +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include +#include + +int main(int argc, char *argv[]) { + // code snippets go here ... +} +``` + +@subsection ex3_p1 Assigning a scalar to an array + +The simplest way to initialize an array is to assign a scalar to it: + +```cpp +// assign a scalar to an array +auto A = nda::array, 2>(3, 2); +A = 0.1 + 0.2i; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[(0.1,0.2),(0.1,0.2)] + [(0.1,0.2),(0.1,0.2)] + [(0.1,0.2),(0.1,0.2)]] +``` + +Note that the behavior of the assignment depends on the algebra of the array. 
+
+While the scalar is assigned to all elements of an array, for matrices it is only assigned to the elements on the
+shorter diagonal:
+
+```cpp
+// assign a scalar to a matrix
+auto M = nda::matrix>(3, 2);
+M = 0.1 + 0.2i;
+std::cout << "M = " << M << std::endl;
+```
+
+Output:
+
+```
+M =
+[[(0.1,0.2),(0,0)]
+ [(0,0),(0.1,0.2)]
+ [(0,0),(0,0)]]
+```
+
+The scalar type can also be more complex, e.g. another nda::array:
+
+```cpp
+// assign an array to an array
+auto A_arr = nda::array, 1>(4);
+A_arr = nda::array{1, 2, 3};
+std::cout << "A_arr = " << A_arr << std::endl;
+```
+
+Output:
+
+```
+A_arr = [[1,2,3],[1,2,3],[1,2,3],[1,2,3]]
+```
+
+@subsection ex3_p2 Copy/Move assignment
+
+The copy and move assignment operations behave as expected:
+
+- copy assignment simply copies the memory handle and layout of an array and
+- move assignment moves them:
+
+```cpp
+// copy assignment
+auto M_copy = nda::matrix>(3, 2);
+M_copy = M;
+std::cout << "M_copy = " << M_copy << std::endl;
+
+// move assignment
+auto M_move = nda::matrix>();
+M_move = std::move(M_copy);
+std::cout << "M_move = " << M_move << std::endl;
+std::cout << "M_copy.empty() = " << M_copy.empty() << std::endl;
+```
+
+Output:
+
+```
+M_copy =
+[[(0.1,0.2),(0,0)]
+ [(0,0),(0.1,0.2)]
+ [(0,0),(0,0)]]
+M_move =
+[[(0.1,0.2),(0,0)]
+ [(0,0),(0.1,0.2)]
+ [(0,0),(0,0)]]
+M_copy.empty() = 1
+```
+
+> **Note**: Be careful when reusing an object after it has been moved (see the note at @ref ex2_p3)!
+
+@subsection ex3_p3 Assigning an nda::Array to an array
+
+To assign an object that satisfies the nda::Array concept to an array is similar to @ref ex2_p5. 
+ +```cpp +// assign a lazy expression +nda::matrix, nda::F_layout> M_f(3, 2); +M_f = M + M; +std::cout << "M_f = " << M_f << std::endl; + +// assign another array with a different layout and algebra +nda::array, 2> A2; +A2 = M_f; +std::cout << "A2 = " << A2 << std::endl; +``` + +Output: + +``` +M_f = +[[(0.2,0.4),(0,0)] + [(0,0),(0.2,0.4)] + [(0,0),(0,0)]] +A2 = +[[(0.2,0.4),(0,0)] + [(0,0),(0.2,0.4)] + [(0,0),(0,0)]] +``` + +Many assignment operations resize the left hand side array if necessary. + +In the above example, the first assignment does not need to do a resize because the left hand side array already has the +correct shape. + +This is not true for the second assignment. +`A2` has been default constructed and therefore has a size of 0. + +@subsection ex3_p4 Assigning a contiguous range + +It is possible to assign an object that satisfies the `std::ranges::contiguous_range` concept to an 1-dimensional array: + +```cpp +// assign a contiguous range to an 1-dimensional array +std::vector vec{1, 2, 3, 4, 5}; +auto A_vec = nda::array(); +A_vec = vec; +std::cout << "A_vec = " << A_vec << std::endl; +``` + +Output: + +``` +A_vec = [1,2,3,4,5] +``` + +As expected, the elements of the range are simply copied into the array. + +@subsection ex3_p5 Initializing an array manually + +We can also initialize an array by assigning to each element manually. +This can be done in different ways. 
+ +For example, + +- using traditional for-loops: + + ```cpp + // initialize an array using traditional for-loops + auto B = nda::array(2, 3); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + B(i, j) = i * 3 + j; + } + } + std::cout << "B = " << B << std::endl; + ``` + + Output: + + ``` + B = + [[0,1,2] + [3,4,5]] + ``` + +- using a range-based for-loop: + + ```cpp + // initialize an array using a range-based for-loop + auto B2 = nda::array(2, 3); + for (int i = 0; auto &x : B2) x = i++; + std::cout << "B2 = " << B2 << std::endl; + ``` + + Output: + + ``` + B2 = + [[0,1,2] + [3,4,5]] + ``` + +- using nda::for_each: + + ```cpp + // initialize an array using nda::for_each + auto B3 = nda::array(2, 3); + nda::for_each(B3.shape(), [&B3](auto i, auto j) { B3(i, j) = i * 3 + j; }); + std::cout << "B3 = " << B3 << std::endl; + ``` + + Output: + + ``` + B3 = + [[0,1,2] + [3,4,5]] + ``` + +While the traditional for-loops are perhaps the most flexible option, it becomes tedious quite fast with increasing +dimensionality. + +@subsection ex3_p6 Initializing an array using automatic assignment + +This has already been explained in @ref ex1_p12. 
+ +Let us give another example: + +```cpp +// initialize an array using CLEF's automatic assignment +using namespace nda::clef::literals; +auto C = nda::array(2, 2, 2, 2); +C(i_, j_, k_, l_) << i_ * 8 + j_ * 4 + k_ * 2 + l_; +std::cout << "C = " << C << std::endl; +``` + +Output: + +``` +C = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +``` + +The advantage of CLEF's automatic assignment becomes obvious when we rewrite line 3 from above using traditional +for-loops: + +```cpp +for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 2; ++l) { + C(i, j, k, l) = i * 8 + j * 4 + k * 2 + l; + } + } + } +} +``` diff --git a/doc/ex4.md b/doc/ex4.md new file mode 100644 index 000000000..f75c12a79 --- /dev/null +++ b/doc/ex4.md @@ -0,0 +1,473 @@ +@page ex4 Example 4: Views and slices + +[TOC] + +In this example, we show how to construct and work with views and slices. + +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include + +int main(int argc, char *argv[]) { + // code snippets go here ... +} +``` + +@subsection ex4_p1 Creating a full view on an array/view + +We have already seen in @ref ex1_p6 how we can get a full view on an existing array by doing an empty function call: + +```cpp +// create a full view on an array +auto A = nda::array(5, 5); +for (int i = 0; auto &x : A) x = i++; +auto A_v = A(); +std::cout << "A_v = " << A_v << std::endl; +``` + +Output: + +``` +A_v = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +The same could have been achieved by + +- calling the member function nda::basic_array::as_array_view, +- calling the free function nda::make_array_view or +- taking a full slice of the array, e.g. `A(nda::ellipsis{})` or `A(nda::range::all, nda::range::all)`. 
+ +Since views behave mostly as arrays, we can also take a view of a view: + +```cpp +// create a view of a view +auto A_vv = A_v(); +std::cout << "A_vv = " << A_vv << std::endl; +``` + +Output: + +``` +A_vv = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +@subsection ex4_p2 Value type of views + +While the value type of an array is always non-const, views can have const or non-const value types. + +Usually, views will have the same value type as the underlying array/view: + +```cpp +// check the value type of the view and assign to it +static_assert(std::is_same_v); +A_v(0, 0) = -1; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[-1,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +This is not the case if we take a view of a const array/view, then the value type will be const: + +```cpp +// taking a view of a const array +auto A_vc = static_cast>(A)(); +static_assert(std::is_same_v); + +// taking a view of a view with a const value type +auto A_vvc = A_vc(); +static_assert(std::is_same_v); +``` + +As expected, we cannot assign to const arrays/views or views with a const value type, i.e. + +```cpp +// this will not compile +A_vc(0, 0) = -2; +``` + +@subsection ex4_p3 Creating a slice of an array/view + +@ref ex1_p7 has already explained what a slice is and how we can create one. +In the following, we will give some more examples to show how slices can be used in practice. 
+ +Here is the original array that we will be working on: + +```cpp +// original array +A(0, 0) = 0; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +Let us start by taking a slice of every other column: + +```cpp +// slice containing every other column +auto S_1 = A(nda::range::all, nda::range(0, 5, 2)); +std::cout << "S_1 = " << S_1 << std::endl; +``` + +Output: + +``` +S_1 = +[[0,2,4] + [5,7,9] + [10,12,14] + [15,17,19] + [20,22,24]] +``` + +Now we take a slice of every other row of this slice: + +```cpp +// slice containing every other row of S_1 +auto S_2 = S_1(nda::range(0, 5, 2), nda::range::all); +std::cout << "S_2 = " << S_2 << std::endl; +``` + +Output: + +``` +S_2 = +[[0,2,4] + [10,12,14] + [20,22,24]] +``` + +We could have gotten the same slice with + +```cpp +// slice containing every other row and column +auto S_3 = A(nda::range(0, 5, 2), nda::range(0, 5, 2)); +std::cout << "S_3 = " << S_3 << std::endl; +``` + +Output: + +``` +S_3 = +[[0,2,4] + [10,12,14] + [20,22,24]] +``` + +@subsection ex4_p4 Assigning to views + +Before assigning to the view `S_3`, let's make a copy of its contents so that we can restore everything later on: + +```cpp +// make a copy of the view using nda::make_regular +auto A_tmp = nda::make_regular(S_3); +``` + +> nda::make_regular turns view, slices and lazy expressions into regular nda::basic_array objects. + +`A_tmp` is an nda::array of the same size as `S_3` and contains the same values but in a different memory location. 
+ +Now, we can assign to the view just like to arrays (see @ref ex3): + +```cpp +// assign a scalar to a view +S_3 = 0; +std::cout << "S_3 = " << S_3 << std::endl; +``` + +Output: + +``` +S_3 = +[[0,0,0] + [0,0,0] + [0,0,0]] +``` + +Any changes that we make to a view will be reflected in the original array and all other views that are currently +looking at the same memory locations: + +```cpp +// check the changes to the original array and other views +std::cout << "S_1 = " << S_1 << std::endl; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +S_1 = +[[0,0,0] + [5,7,9] + [0,0,0] + [15,17,19] + [0,0,0]] +A = +[[0,1,0,3,0] + [5,6,7,8,9] + [0,11,0,13,0] + [15,16,17,18,19] + [0,21,0,23,0]] +``` + +To restore the original array, we can assign our copy from before: + +```cpp +// assign an array to a view +S_3 = A_tmp; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +> **Note**: In contrast to arrays, views cannot be resized. When assigning some general nda::Array object to a view, +> their shapes have to match, otherwise this may result in undefined behavior. + +@subsection ex4_p5 Copy/Move operations + +The copy and move operations of views are a little bit different than their array counterparts: + +- The copy constructor makes a new view that points to the same data as the other view. +- The move constructor does the same as the copy constructor. +- The copy assignment operator makes a deep copy of the contents of the other view. +- The move assignment operator does the same as the copy assignment. 
+

```cpp
// copy construct a view
auto S_3_copy = S_3;
std::cout << "S_3.data() == S_3_copy.data() = " << (S_3.data() == S_3_copy.data()) << std::endl;

// move construct a view
auto S_3_move = std::move(S_3);
std::cout << "S_3.data() == S_3_move.data() = " << (S_3.data() == S_3_move.data()) << std::endl;

// copy assign to a view
auto B = nda::array(S_3.shape());
auto B_v = B();
B_v = S_3;
std::cout << "B = " << B << std::endl;

// move assign to a view
auto C = nda::array(S_3.shape());
auto C_v = C();
C_v = std::move(S_3);
std::cout << "C = " << C << std::endl;
```

Output:

```
S_3.data() == S_3_copy.data() = 1
S_3.data() == S_3_move.data() = 1
B =
[[0,2,4]
 [10,12,14]
 [20,22,24]]
C =
[[0,2,4]
 [10,12,14]
 [20,22,24]]
```

@subsection ex4_p6 Operating on views/slices

We can perform various arithmetic operations, mathematical functions and algorithms with views and slices just like we
did with arrays in @ref ex1_p8 and @ref ex1_p9.

```cpp
// arithmetic operations and math functions
C_v = S_3 * 2 + nda::pow(S_3, 2);
std::cout << "C = " << C << std::endl;

// algorithms
std::cout << "min_element(S_3) = " << nda::min_element(S_3) << std::endl;
std::cout << "max_element(S_3) = " << nda::max_element(S_3) << std::endl;
std::cout << "sum(S_3) = " << nda::sum(S_3) << std::endl;
std::cout << "product(S_3) = " << nda::product(S_3) << std::endl;
```

Output:

```
C =
[[0,8,24]
 [120,168,224]
 [440,528,624]]
min_element(S_3) = 0
max_element(S_3) = 24
sum(S_3) = 108
product(S_3) = 0
```

@subsection ex4_p7 Rebinding a view to another array/view

If we want to bind an existing view to a new array/view/memory location, we cannot simply use the copy assignment (since
it makes a deep copy of the view's contents).
Instead we have to call nda::basic_array_view::rebind: + +```cpp +// rebind a view +std::cout << "S_3.data() == C_v.data() = " << (S_3.data() == C_v.data()) << std::endl; +C_v.rebind(S_3); +std::cout << "S_3.data() == C_v.data() = " << (S_3.data() == C_v.data()) << std::endl; +``` + +Output: + +``` +S_3.data() == C_v.data() = 0 +S_3.data() == C_v.data() = 1 +``` + +@subsection ex4_p8 Viewing generic 1-dimensional ranges + +The views in **nda** can also view generic 1-dimensional ranges like `std::vector` or `std::array`. +The only requirement is that they are contiguous: + +```cpp +// take a view on a std::array +std::array arr{1.0, 2.0, 3.0, 4.0, 5.0}; +auto arr_v = nda::basic_array_view(arr); +std::cout << "arr_v = " << arr_v << std::endl; + +// change the value of the vector through the view +arr_v *= 2.0; +std::cout << "arr = " << arr << std::endl; +``` + +Output: + +``` +arr_v = [1,2,3,4,5] +arr = (2 4 6 8 10) +``` + +@subsection ex4_p9 Factories and transformations + +@ref av_factories contain various functions to create new and transform existing views. + +In the following, we will show some examples. + +Let us start from this array: + +```cpp +// original array +auto D = nda::array(3, 4); +for (int i = 0; auto &x : D) x = i++; +std::cout << "D = " << D << std::endl; +``` + +Output: + +``` +D = +[[0,1,2,3] + [4,5,6,7] + [8,9,10,11]] +``` + +We can transpose `D` using nda::transpose: + +```cpp +// create a transposed view +auto D_t = nda::transpose(D); +std::cout << "D_t = " << D_t << std::endl; +``` + +Output: + +``` +D_t = +[[0,4,8] + [1,5,9] + [2,6,10] + [3,7,11]] +``` + +> **Note**: `D_t` is a view and not an array. That means that transposing an existing array/view is a very cheap +> operation since it doesn't allocate or copy any data. +> +> The same is true for most of the other transformations. 
+

As long as the data is contiguous in memory and the memory layout is either in C-order or Fortran-order, we are allowed
to change the shape of an array/view:

```cpp
// reshape the array
auto D_r = nda::reshape(D, 2, 6);
std::cout << "D_r = " << D_r << std::endl;

// reshape the view
auto D_tr = nda::reshape(D_t, 2, 6);
std::cout << "D_tr = " << D_tr << std::endl;
```

Output:

```
D_r =
[[0,1,2,3,4,5]
 [6,7,8,9,10,11]]
D_tr =
[[0,2,4,6,8,10]
 [1,3,5,7,9,11]]
```

To interpret some array/view as a contiguous 1-dimensional range, we can call nda::flatten which is just a convenient
wrapper around nda::reshape:

```cpp
// flatten the view
auto D_t_flat = nda::flatten(D_t);
std::cout << "D_t_flat = " << D_t_flat << std::endl;

// flatten the array using reshape
auto D_flat = nda::reshape(D, D.size());
std::cout << "D_flat = " << D_flat << std::endl;
```

Output:

```
D_t_flat = [0,1,2,3,4,5,6,7,8,9,10,11]
D_flat = [0,1,2,3,4,5,6,7,8,9,10,11]
```

**nda** provides some more advanced transformations which are especially useful for higher-dimensional arrays/views.
We refer the interested user to the @ref documentation. diff --git a/doc/ex5.md b/doc/ex5.md new file mode 100644 index 000000000..956c8fdb2 --- /dev/null +++ b/doc/ex5.md @@ -0,0 +1,404 @@ +@page ex5 Example 5: HDF5 support +
+[TOC]
+
+In this example, we show how to write/read **nda** arrays and views to/from HDF5 files.
+
+**nda** uses the [h5](https://triqs.github.io/h5/unstable/index.html) library and especially the
+[h5::array_interface](https://triqs.github.io/h5/unstable/group__rw__arrayinterface.html) to provide HDF5 support.
+
+All the following code snippets are part of the same `main` function:
+
+```cpp
+#include
+#include
+#include
+#include + +int main(int argc, char *argv[]) { + // HDF5 file + h5::file file("ex5.h5", 'w'); + + // code snippets go here ... +} +``` + +Before we dive into the HDF5 capabilities of **nda**, let us first specify the array that we will be working with: + +```cpp +// original array +auto A = nda::array(5, 5); +for (int i = 0; auto &x : A) x = i++; +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +> **Note**: In the examples below, we will restrict ourselves to arrays and views in C-order. The reason is that when +> writing/reading to/from HDF5, the interface always checks if the arrays/views are in C-order. If this is not the case, +> it will use a temporary C-order array to perform the writing/reading. + +@subsection ex5_p1 Writing an array/view + +Writing an array to an HDF5 file is as simple as + +```cpp +// write the array to the file +h5::write(file, "A", A); +``` + +Dumping the HDF5 file gives + +``` +HDF5 "ex5.h5" { +GROUP "/" { + DATASET "A" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 5, 5 ) / ( 5, 5 ) } + DATA { + (0,0): 0, 1, 2, 3, 4, + (1,0): 5, 6, 7, 8, 9, + (2,0): 10, 11, 12, 13, 14, + (3,0): 15, 16, 17, 18, 19, + (4,0): 20, 21, 22, 23, 24 + } + } +} +} +``` + +The array is written into a newly created dataset with a dataspace that has the same dimensions and the same shape as +the original array. +In this case, it is a 5-by-5 dataspace. + +When writing to HDF5, the interface doesn't distinguish between arrays and views. 
+We could have done the same with a view: + +```cpp +// write a view to HDF5 +h5::write(file, "A_v", A(nda::range(0, 5, 2), nda::range(0, 5, 2))); +``` + +Dumping the corresponding HDF5 dataspace gives + +``` +HDF5 "ex5.h5" { +DATASET "/A_v" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 3, 3 ) / ( 3, 3 ) } + DATA { + (0,0): 0, 2, 4, + (1,0): 10, 12, 14, + (2,0): 20, 22, 24 + } +} +} +``` + +Here, we write a view of every other row and column of the original array. +Again, the created dataspace `A_v` has the same dimensions and shape as the object that is written. +In this case, a 3-by-3 view. + +> **Note**: nda::h5_write takes a fourth parameter which determines if the data should be compressed before it is +> written. By default, this is set to `true`. To turn the compression off, one can specify it in the `h5::write` call, +> e.g +> ```cpp +> h5::write(file, "A", A, /* compression off */ false); +> ``` + +@subsection ex5_p2 Reading into an array/view + +Reading a full dataset into an array is straightforward: + +```cpp +// read a dataset into an array +auto A_r = nda::array(); +h5::read(file, "A", A_r); +std::cout << "A_r = " << A_r << std::endl; +``` + +Output: + +``` +A_r = +[[0,1,2,3,4] + [5,6,7,8,9] + [10,11,12,13,14] + [15,16,17,18,19] + [20,21,22,23,24]] +``` + +As you can see, the array does not need to have the same shape as the dataset, since the nda::h5_read function will +resize it if needed. + +The same is not true for views. 
+Views cannot be resized, so when we read into a view, we have to make sure that it has the correct shape (otherwise an +exception will be thrown): + +```cpp +// read a dataset into a view +auto B = nda::zeros(5, 5); +auto B_v = B(nda::range(1, 4), nda::range(0, 5, 2)); +h5::read(file, "A_v", B_v); +std::cout << "B = " << B << std::endl; +``` + +Output: + +``` +B = +[[0,0,0,0,0] + [0,0,2,0,4] + [10,0,12,0,14] + [20,0,22,0,24] + [0,0,0,0,0]] +``` + +Here, we read the 3-by-3 dataset `A_v` into a view `B_v` consisting of every other column and the rows 1, 2 and 3 of the +underlying 5-by-5 array `B`. + +@subsection ex5_p3 Writing to a slice of an existing dataset + +So far we have only written to an automatically created dataset with exactly the same size and shape as the array/view +that is being written. +It is also possible to write to a slice of an existing dataset as long as the selected slice has the same shape and size +as the array/view. + +To demonstrate this, let us first create a dataset and zero it out (in production code, one would probably call the HDF5 +C library directly to create a dataspace and a dataset but this is not needed for this simple example): + +```cpp +// prepare a dataset +h5::write(file, "B", nda::zeros(5, 5)); +``` + +Dumping this dataset gives + +``` +HDF5 "ex5.h5" { +DATASET "/B" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 5, 5 ) / ( 5, 5 ) } + DATA { + (0,0): 0, 0, 0, 0, 0, + (1,0): 0, 0, 0, 0, 0, + (2,0): 0, 0, 0, 0, 0, + (3,0): 0, 0, 0, 0, 0, + (4,0): 0, 0, 0, 0, 0 + } +} +} +``` + +Then, we can take a slice of this dataset, e.g. 
by specifying every other row: + +```cpp +// a slice that specifies every other row of "B" +auto slice_r024 = std::make_tuple(nda::range(0, 5, 2), nda::range::all); +``` + +Let's write the first 3 rows of `A` to this slice: + +```cpp +// write the first 3 rows of A to the slice +h5::write(file, "B", A(nda::range(0, 3), nda::range::all), slice_r024); +``` + +Dumping the dataset gives + +``` +HDF5 "ex5.h5" { +DATASET "/B" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 5, 5 ) / ( 5, 5 ) } + DATA { + (0,0): 0, 1, 2, 3, 4, + (1,0): 0, 0, 0, 0, 0, + (2,0): 5, 6, 7, 8, 9, + (3,0): 0, 0, 0, 0, 0, + (4,0): 10, 11, 12, 13, 14 + } +} +} +``` + +Under the hood, **nda** takes the slice and transforms it into an HDF5 hyperslab to which the data is then written. + +If we write the remaining last 2 rows of `A` to the empty rows in `B`, + +```cpp +// write the last 2 rows of A to the empty rows in "B" +auto slice_r13 = std::make_tuple(nda::range(1, 5, 2), nda::range::all); +h5::write(file, "B", A(nda::range(3, 5), nda::range::all), slice_r13); +``` + +we get + +``` +HDF5 "ex5.h5" { +DATASET "/B" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 5, 5 ) / ( 5, 5 ) } + DATA { + (0,0): 0, 1, 2, 3, 4, + (1,0): 15, 16, 17, 18, 19, + (2,0): 5, 6, 7, 8, 9, + (3,0): 20, 21, 22, 23, 24, + (4,0): 10, 11, 12, 13, 14 + } +} +} +``` + +@subsection ex5_p4 Reading a slice from an existing dataset + +Instead of reading the full dataset as we have done before, it is possible to specify a slice of the dataset that should +be read. 
+ +We can reuse the slices from above to first read the rows 0, 2, and 4 and then the rows 1 and 3 of `A` into the first +3 and the last 2 rows of a 5-by-5 array, respectively: + +```cpp +// read every other row of "A" into the first 3 rows of C +auto C = nda::zeros(5, 5); +auto C_r012 = C(nda::range(0, 3), nda::range::all); +h5::read(file, "A", C_r012, slice_r024); +std::cout << "C = " << C << std::endl; + +// read rows 1 and 3 of "A" into the empty 2 rows of C +auto C_r13 = C(nda::range(3, 5), nda::range::all); +h5::read(file, "A", C_r13, slice_r13); +std::cout << "C = " << C << std::endl; +``` + +Output: + +``` +C = +[[0,1,2,3,4] + [10,11,12,13,14] + [20,21,22,23,24] + [0,0,0,0,0] + [0,0,0,0,0]] +C = +[[0,1,2,3,4] + [10,11,12,13,14] + [20,21,22,23,24] + [5,6,7,8,9] + [15,16,17,18,19]] +``` + +@subsection ex5_p5 Writing/Reading 1-dimensional arrays/views of strings + +For the user, writing and reading an 1-dimensional array/view of strings works exactly the same way as with an +array/view of arithmetic scalars: + +```cpp +// write an array of strings +auto S = nda::array{"Hi", "my", "name", "is", "John"}; +h5::write(file, "S", S); + +// read an array of strings +auto S_r = nda::array(); +h5::read(file, "S", S_r); +std::cout << "S_r = " << S_r << std::endl; +``` + +Output: + +``` +S_r = [Hi,my,name,is,John] +``` + +The dumped HDF5 dataset gives + +``` +HDF5 "ex5.h5" { +DATASET "/S" { + DATATYPE H5T_STRING { + STRSIZE 5; + STRPAD H5T_STR_NULLTERM; + CSET H5T_CSET_UTF8; + CTYPE H5T_C_S1; + } + DATASPACE SIMPLE { ( 5 ) / ( 5 ) } + DATA { + (0): "Hi", "my", "name", "is", "John" + } +} +} +``` + +@subsection ex5_p6 Writing/Reading arrays/views of generic types + +**nda** allows us to write/read arbitrary arrays/views as long as the objects contained in the array have specialized +`h5_write` and `h5_read` functions (see [h5 docs](https://triqs.github.io/h5/unstable/group__rw__generic.html)). 
+ +For example, an array of integer arrays can be written/read as + +```cpp +// write an array of integer arrays +auto I = nda::array, 1>{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}}; +h5::write(file, "I", I); + +// read an array of integer arrays +auto I_r = nda::array, 1>(); +h5::read(file, "I", I_r); +std::cout << "I_r = " << I_r << std::endl; +``` + +Output: + +``` +I_r = [[0,1,2],[3,4,5],[6,7,8]] +``` + +Dumping the corresponding HDF5 group gives + +``` +HDF5 "ex5.h5" { +GROUP "/I" { + DATASET "0" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 3 ) / ( 3 ) } + DATA { + (0): 0, 1, 2 + } + } + DATASET "1" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 3 ) / ( 3 ) } + DATA { + (0): 3, 4, 5 + } + } + DATASET "2" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 3 ) / ( 3 ) } + DATA { + (0): 6, 7, 8 + } + } + DATASET "shape" { + DATATYPE H5T_STD_I64LE + DATASPACE SIMPLE { ( 1 ) / ( 1 ) } + DATA { + (0): 3 + } + } +} +} +``` + +Now, `I` is an HDF5 group and not a dataset and each object of the array, i.e. each integer array, is written to its +own dataset with a name corresponding to its index in the array. +In this case, "0", "1" and "2". diff --git a/doc/ex6.md b/doc/ex6.md new file mode 100644 index 000000000..9c0fa7d7f --- /dev/null +++ b/doc/ex6.md @@ -0,0 +1,440 @@ +@page ex6 Example 6: MPI support + +[TOC] + +In this example, we show how to scatter, gather, broadcast and reduce **nda** arrays and views across MPI processes. + +**nda** uses the [mpi](https://triqs.github.io/mpi/unstable/index.html) library to provide MPI support. 
+ +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include +#include +#include + +template +void print(const A &arr, const mpi::communicator &comm) { + std::cout << "Rank " << comm.rank() << ": " << arr << std::endl; +} + +int main(int argc, char *argv[]) { + // initialize MPI environment and communicator + mpi::environment env(argc, argv); + mpi::communicator comm; + const int root = 0; + + // code snippets go here ... +} +``` + +The examples below are run on 4 processes. + +> **Note**: Only regular arrays and views are allowed in the **nda** MPI routines, no lazy expressions. You can use +> nda::make_regular to turn your lazy expressions into regular arrays. + +@subsection ex6_p1 Broadcasting an array/view + +Let us first default construct an array on all MPI ranks and then resize and initialize it on rank 0, the root rank: + +```cpp +// create an array and initialize it on root +auto A = nda::array(); +if (comm.rank() == root) { + A.resize(2, comm.size()); + for (int i = 0; auto &x : A) x = i++; +} +print(A, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: +[] +Rank 3: +[] +Rank 0: +[[0,2,4,6] + [1,3,5,7]] +Rank 1: +[] +``` + +As you can see, the array is only initialized on root. +The reason why we use Fortran-order will be explained below. + +> **Note**: The `comm.barrier()` is not necessary. It makes sure that the `print` statements are executed and finished +> before we proceed. + +Broadcasting the array from the root to all other ranks is easy: + +```cpp +// broadcast the array from root to all other ranks +mpi::broadcast(A, comm); +print(A, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: +[[0,2,4,6] + [1,3,5,7]] +Rank 3: +[[0,2,4,6] + [1,3,5,7]] +Rank 0: +[[0,2,4,6] + [1,3,5,7]] +Rank 1: +[[0,2,4,6] + [1,3,5,7]] +``` + +Although the output might not be in order, it is clear that the array is the same on all ranks after broadcasting. 
+ +Also note that the `broadcast` call automatically resizes arrays if necessary. +This is not true for views since they cannot be resized. +The caller is responsible for making sure that the shapes of the views are correct (otherwise an exception is thrown). + +Before we broadcast a view, let us again prepare our arrays: + +```cpp +// prepare the array +A = 0; +if (comm.rank() == root) { + A(nda::range::all, root) = 1; +} +print(A, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 0: +[[1,0,0,0] + [1,0,0,0]] +Rank 2: +[[0,0,0,0] + [0,0,0,0]] +Rank 1: +[[0,0,0,0] + [0,0,0,0]] +Rank 3: +[[0,0,0,0] + [0,0,0,0]] +``` + +Now we want to broadcast the first column of the array on root to the \f$ i^{th} \f$ column of the array on rank +\f$ i \f$: + +```cpp +// broadcast the first column from root to all other ranks +auto A_v = A(nda::range::all, comm.rank()); +mpi::broadcast(A_v, comm); +print(A, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 1: +[[0,1,0,0] + [0,1,0,0]] +Rank 0: +[[1,0,0,0] + [1,0,0,0]] +Rank 2: +[[0,0,1,0] + [0,0,1,0]] +Rank 3: +[[0,0,0,1] + [0,0,0,1]] +``` + +If `A` would not be in Fortran-order, this code snippet wouldn't compile since the elements of a single column are not +contiguous in memory in C-order. + +> **Note**: All MPI routines have certain requirements for the arrays/views involved in the operation. Please check out +> the documentation of the individual function, e.g. in this case nda::mpi_broadcast, if you have doubts. 
+ +@subsection ex6_p2 Gathering an array/view + +Suppose we have a 1-dimensional array with rank specific elements and sizes: + +```cpp +// prepare the array to be gathered +auto B = nda::array(comm.rank() + 1); +B() = comm.rank(); +print(B, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 1: [1,1] +Rank 3: [3,3,3,3] +Rank 0: [0] +Rank 2: [2,2,2] +``` + +Let us gather the 1-dimensional arrays on the root process: + +```cpp +// gather the arrays on root +auto B_g = mpi::gather(B, comm, root); +print(B_g, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: [] +Rank 3: [] +Rank 0: [0,1,1,2,2,2,3,3,3,3] +Rank 1: [] +``` + +Only the root process will have the gathered result available. +If the other ranks need access to the result, we can do an all-gather instead: + +```cpp +// all-gather the arrays +auto B_g = mpi::gather(B, comm, root, /* all */ true); +``` + +which is equivalent to + +```cpp +auto B_g = mpi::all_gather(B, comm); +``` + +Views work exactly the same way. +For example, let us gather 2-dimensional reshaped views of 1-dimensional arrays: + +```cpp +// resize and reshape the arrays and gather the resulting views +B.resize(4); +B = comm.rank(); +auto B_r = nda::reshape(B, 2, 2); +auto B_rg = mpi::gather(B_r, comm, root); +print(B_rg, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 1: +[] +Rank 0: +[[0,0] + [0,0] + [1,1] + [1,1] + [2,2] + [2,2] + [3,3] + [3,3]] +Rank 2: +[] +Rank 3: +[] +``` + +Higher-dimensional arrays/views are always gathered along the first dimension. +The extent of the first dimension does not have to be the same on all ranks. +For example, if we gather \f$ M \f$ arrays of size \f$ N_1^{(i)} \times \dots \times N_d \f$, then the resulting array +will be of size \f$ \prod_{i=1}^M N_1^{(i)} \times \dots \times N_d \f$ . + +nda::mpi_gather requires all arrays/views to be in C-order. 
+Other memory layouts have to be transposed or permuted (see nda::transpose and the more general +nda::permuted_indices_view) before they can be gathered. + +```cpp +// gather Fortran-layout arrays +auto B_f = nda::array(2, 2); +B_f = comm.rank(); +auto B_fg = mpi::gather(nda::transpose(B_f), comm, root); +print(B_fg, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 1: +[] +Rank 3: +[] +Rank 0: +[[0,0] + [0,0] + [1,1] + [1,1] + [2,2] + [2,2] + [3,3] + [3,3]] +Rank 2: +[] +``` + +The resulting array `B_fg` has to be in C-order. +If we want it to be in Fortran-order, we can simply transpose it: + +```cpp +// transpose the result +print(nda::transpose(B_fg), comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: +[] +Rank 0: +[[0,0,1,1,2,2,3,3] + [0,0,1,1,2,2,3,3]] +Rank 1: +[] +Rank 3: +[] +``` + +@subsection ex6_p3 Scattering an array/view + +Scattering of an array/view is basically the inverse operation of gathering. +It takes an array/view and splits it along the first dimensions as evenly as possible among the processes. + +For example, to scatter the same array that we just gathered, we can do + +```cpp +// scatter an array from root to all other ranks +nda::array C = mpi::scatter(B_rg, comm, root); +print(C, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: +[[2,2] + [2,2]] +Rank 1: +[[1,1] + [1,1]] +Rank 0: +[[0,0] + [0,0]] +Rank 3: +[[3,3] + [3,3]] +``` + +Similar to the nda::mpi_gather, nda::mpi_scatter requires the input and output arrays to be in C-order. + +If the extent along the first dimension is not divisible by the number of MPI ranks, processes with lower ranks will +receive more data than others: + +```cpp +// scatter an array with extents not divisible by the number of ranks +auto C_s = mpi::scatter(C, comm, 2); +print(C_s, comm); +comm.barrier(); +``` + +Output: + +``` +Rank 2: +[] +Rank 0: +[[2,2]] +Rank 3: +[] +Rank 1: +[[2,2]] +``` + +Here, a 2-by-2 array is scattered from rank 2 to all other processes. 
+
It is split along the first dimension and the resulting 1-by-2 subarrays are sent to the ranks 0 and 1 while the ranks
2 and 3 do not receive any data.

@subsection ex6_p4 Reducing an array/view

Let us reduce the same 2-by-2 arrays from above.
By default, `mpi::reduce` performs an element-wise summation among the ranks in the communicator and makes the result
available only on the root process:

```cpp
// reduce an array on root using MPI_SUM
auto D = mpi::reduce(C, comm, root);
print(D, comm);
comm.barrier();
```

Output:

```
Rank 2:
[]
Rank 3:
[]
Rank 1:
[]
Rank 0:
[[6,6]
 [6,6]]
```

To use a different reduction operation or to send the result to all ranks, we can do

```cpp
auto D = mpi::reduce(C, comm, root, /* all */ true, MPI_OP);
```

or the more compact

```cpp
auto D = mpi::all_reduce(C, comm, root, MPI_OP);
```

It is also possible to perform the reduction in-place, i.e. the result is written to the input array:

```cpp
// all-reduce an array in-place
mpi::all_reduce_in_place(C, comm);
print(C, comm);
comm.barrier();
```

Output:

```
Rank 2:
[[6,6]
 [6,6]]
Rank 0:
[[6,6]
 [6,6]]
Rank 3:
[[6,6]
 [6,6]]
Rank 1:
[[6,6]
 [6,6]]
```

In contrast to the standard `mpi::all_reduce` function, the in-place operation does not create and return a new array.
Instead, the result is directly written into the input array. diff --git a/doc/ex7.md b/doc/ex7.md new file mode 100644 index 000000000..50537da9c --- /dev/null +++ b/doc/ex7.md @@ -0,0 +1,314 @@ +@page ex7 Example 7: Making use of symmetries +
+[TOC]
+
+In this example, we show how to use, spot and define symmetries in **nda** arrays.
+ +All the following code snippets are part of the same `main` function: + +```cpp +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + // size of the matrix + constexpr int N = 3; + + // some useful typedefs + using idx_t = std::array; + using sym_t = std::tuple; + using sym_func_t = std::function; + + // code snippets go here ... +} +``` + +Here, we have defined some basic quantities/types: +- \f$ N \times N \f$ is the size of the matrix that we are working with. +- `idx_t` defines a multi-dimensional index type (in our case it is 2-dimensional). +- `sym_t` is a tuple type consisting of a multi-dimensional index and an nda::operation. +- `sym_func_t` is a callable type that takes a multi-dimensional index and returns a `sym_t` object. The index in the +`sym_t` object refers to an element which is related to the element at the original index by the nda::operation. + +Let's take a look at a simple example. + +@subsection ex7_p1 Defining the symmetry + +We start by defining a hermitian symmetry for our matrix. +A symmetry is simply an object of type `sym_func_t` that specifies how elements in an array are related to one another. + +```cpp +// define a hermitian symmetry (satisfies nda::NdaSymmetry) +auto h_symmetry = [](idx_t const &x) { + if (x[0] == x[1]) return sym_t{x, nda::operation{false, false}}; // sign flip = false, complex conjugate = false + return sym_t{idx_t{x[1], x[0]}, nda::operation{false, true}}; // sign flip = false, complex conjugate = true +}; +``` + +As you can see, `h_symmetry` is a callable object that takes an `idx_t` and returns a `sym_t` object. +The two lines have the following meaning: +1. Diagonal elements are not related to any other elements (except to themselves by the identity operation). +2. Off-diagonal elements with an index `(i,j)` are related to the element `(j,i)` by a complex conjugation. 
+ +@subsection ex7_p2 Constructing the symmetry group + +Once all of the desired symmetries have been defined, we can construct an nda::sym_grp object: + +```cpp +// construct the symmetry group +nda::array, 2> A(N, N); +auto grp = nda::sym_grp{A, std::vector{h_symmetry}}; +``` + +The constructor of the symmetry group takes an array and a list of symmetries as input. +It uses the shape of the array and the symmetries to find all symmetry classes. + +For our \f$ 3 \times 3 \f$ hermitian matrix, there should be 6 symmetry classes corresponding to the 6 independent +elements of the matrix: + +```cpp +// check the number of symmetry classes +std::cout << "Number of symmetry classes: " << grp.num_classes() << std::endl; +``` + +Output: + +``` +Number of symmetry classes: 6 +``` + +A symmetry class contains all the elements that are related to each other by some symmetry. +It stores the flat index of the element as well as the nda::operation specifying how it is related to a special +representative element of the class. +In practice, the representative element is simply the first element of the class. 
+

Let us print all the symmetry classes:

```cpp
// print the symmetry classes
for (int i = 1; auto const &c : grp.get_sym_classes()) {
  std::cout << "Symmetry class " << i++ << ":" << std::endl;
  for (auto const &x : c) {
    std::cout << "  Idx: " << x.first << ", Sign flip: " << x.second.sgn << ", Complex conjugation: " << x.second.cc << std::endl;
  }
}
```

Output:

```
Symmetry class 1:
  Idx: 0, Sign flip: 0, Complex conjugation: 0
Symmetry class 2:
  Idx: 1, Sign flip: 0, Complex conjugation: 0
  Idx: 3, Sign flip: 0, Complex conjugation: 1
Symmetry class 3:
  Idx: 2, Sign flip: 0, Complex conjugation: 0
  Idx: 6, Sign flip: 0, Complex conjugation: 1
Symmetry class 4:
  Idx: 4, Sign flip: 0, Complex conjugation: 0
Symmetry class 5:
  Idx: 5, Sign flip: 0, Complex conjugation: 0
  Idx: 7, Sign flip: 0, Complex conjugation: 1
Symmetry class 6:
  Idx: 8, Sign flip: 0, Complex conjugation: 0
```

Keep in mind that elements of our matrix have the following flat indices:

```cpp
// print flat indices
nda::array B(3, 3);
for (int i = 0; auto &x : B) x = i++;
std::cout << B << std::endl;
```

Output:

```
[[0,1,2]
 [3,4,5]
 [6,7,8]]
```

Now we can see why the elements 0, 4 and 8 are in their own symmetry class and elements 1 and 3, 2 and 6 and elements 5
and 7 are related to each other via a complex conjugation.

@subsection ex7_p3 Initializing an array

One of the features of symmetry groups is that they can be used to initialize or assign to an existing array with an
initializer function satisfying the nda::NdaInitFunc concept:

```cpp
// define an initializer function
int count_eval = 0;
auto init_func = [&count_eval, &B](idx_t const &idx) {
  ++count_eval;
  const double val = B(idx[0], idx[1]);
  return std::complex{val, val};
};
```

An nda::NdaInitFunc object should take a multi-dimensional index as input and return a value for the given index.
+Since we are working with complex matrices, the return type in our case is `std::complex`. + +`count_eval` simply counts the number of times `init_func` has been called. + +Let's initialize our array: + +```cpp +// initialize the array using the symmetry group +grp.init(A, init_func); +std::cout << "A = " << A << std::endl; +``` + +Output: + +``` +A = +[[(0,0),(1,1),(2,2)] + [(1,-1),(4,4),(5,5)] + [(2,-2),(5,-5),(8,8)]] +``` + +As expected the array is hermitian. + +The advantage of using symmetry groups to initialize an array becomes clear when we check how often our `init_func` +object has been called: + +```cpp +// check the number of evaluations +std::cout << "Number of evaluations: " << count_eval << std::endl; +``` + +Output: + +``` +Number of evaluations: 6 +``` + +It has been called exactly once for each symmetry class. + +In our simple example, this does not really make a difference but for large arrays where the number of symmetry classes +is much smaller than the size of the array and where evaluating `init_func` is expensive, this can give a considerable +speedup. + +@subsection ex7_p4 Representative data + +We have already learned above about representative elements of symmetry classes. +The symmetry group object gives us access to those elements: + +```cpp +// get representative elements +auto reps = grp.get_representative_data(A); +auto reps_view = nda::array_view, 1>(reps); +std::cout << "Representative elements = " << reps_view << std::endl; +``` + +Output: + +``` +Representative elements = [(0,0),(1,1),(2,2),(4,4),(5,5),(8,8)] +``` + +The representative elements in our case simply belong to the upper triangular part of the matrix. + +Representative data can also be used to initialize or assign to an array. 
+
+Let's try this out:
+
+```cpp
+// use representative data to initialize a new array
+reps_view *= 2.0;
+nda::array<std::complex<double>, 2> C(N, N);
+grp.init_from_representative_data(C, reps);
+std::cout << "C = " << C << std::endl;
+```
+
+Output:
+
+```
+C =
+[[(0,0),(2,2),(4,4)]
+ [(2,-2),(8,8),(10,10)]
+ [(4,-4),(10,-10),(16,16)]]
+```
+
+Here, we first multiplied the original representative data by 2 and then initialized a new array with it.
+
+@subsection ex7_p5 Symmetrizing an array
+
+The nda::sym_grp class provides a method that lets us symmetrize an existing array and simultaneously obtain the
+maximum symmetry violation.
+
+For each symmetry class, it first calculates the representative element by looping over all elements of the class,
+applying the stored nda::operation to each element (for a sign flip and a complex conjugation this is equal to applying
+the inverse operation) and summing the resulting values.
+The sum divided by the number of elements in the symmetry class gives us the representative element.
+
+It then compares each element in the class to the representative element, notes their absolute difference to obtain
+the maximum symmetry violation and sets the element to the value of the representative element (modulo the
+nda::operation).
+
+Symmetrizing one of our already hermitian matrices shouldn't change the matrix and the maximum symmetry violation should
+be zero:
+
+```cpp
+// symmetrize an already symmetric array
+auto v1 = grp.symmetrize(A);
+std::cout << "Symmetrized A = " << A << std::endl;
+std::cout << "Max. violation at index " << v1.first << " = " << v1.second << std::endl;
+```
+
+Output:
+
+```
+Symmetrized A =
+[[(0,0),(1,1),(2,2)]
+ [(1,-1),(4,4),(5,5)]
+ [(2,-2),(5,-5),(8,8)]]
+Max. 
violation at index (0 0) = 0
+```
+
+Let's change one of the off-diagonal elements:
+
+```cpp
+// change an off-diagonal element
+A(0, 2) *= 2.0;
+std::cout << "A = " << A << std::endl;
+```
+
+Output:
+
+```
+A =
+[[(0,0),(1,1),(4,4)]
+ [(1,-1),(4,4),(5,5)]
+ [(2,-2),(5,-5),(8,8)]]
+```
+
+The matrix is not hermitian anymore and therefore violates our initial `h_symmetry`.
+
+If we symmetrize again:
+
+```cpp
+// symmetrize again
+auto v2 = grp.symmetrize(A);
+std::cout << "Symmetrized A = " << A << std::endl;
+std::cout << "Max. violation at index " << v2.first << " = " << v2.second << std::endl;
+```
+
+Output:
+
+```
+Symmetrized A =
+[[(0,0),(1,1),(3,3)]
+ [(1,-1),(4,4),(5,5)]
+ [(3,-3),(5,-5),(8,8)]]
+Max. violation at index (0 2) = 1.41421
+```
+
+We can see that the symmetrized matrix is now different from the original and that the maximum symmetry violation is
+not equal to zero.
diff --git a/doc/ex8.md b/doc/ex8.md
new file mode 100644
index 000000000..42b825d2b
--- /dev/null
+++ b/doc/ex8.md
@@ -0,0 +1,411 @@
+@page ex8 Example 8: Linear algebra support
+
+[TOC]
+
+In this example, we show how to do linear algebra with **nda** arrays.
+
+All the following code snippets are part of the same `main` function:
+
+```cpp
+#include
+#include
+#include
+
+int main(int argc, char *argv[]) {
+  // code snippets go here ...
+}
+```
+
+Before showing some linear algebra related operations, let us first introduce the nda::matrix and nda::vector types and
+highlight their similarities and differences with respect to nda::array.
+
+@subsection ex8_p1 Matrix and vector types
+
+As already mentioned in the quick introduction @ref ex1_p11, **nda** provides an nda::matrix and an nda::vector type.
+Both are specializations of the general nda::basic_array with the following features:
+- nda::matrix is a 2-dimensional array that belongs to the ``'M'`` algebra.
+- nda::vector is a 1-dimensional array that belongs to the ``'V'`` algebra. 
+ +Their algebras determine how matrices and vectors behave in certain operations and expressions. + +Otherwise, everything that is true for nda::basic_array objects is also true for nda::matrix and nda::vector objects. + +@subsection ex8_p2 Constructing matrices and vectors + +To construct a matrix or a vector, we can use the same methods as for nda::array objects. +We refer the user to @ref ex2 for more information. + +For example, to construct a matrix and a vector of a certain shape, we can do + +```cpp +// construct a 4x3 matrix and a vector of size 3 +nda::matrix M(4, 3); +nda::vector v(3); +``` + +Here, a \f$ 4 \times 3 \f$ matrix and a vector of size \f$ 3 \f$ are created. + +Let us note that there are some @ref av_factories that are specific to matrices: + +```cpp +// construct a 3x3 identity matrix +auto I = nda::eye(3); +std::cout << "I = " << I << std::endl; + +// construct a 2x2 diagonal matrix +auto D = nda::diag(nda::array{1, 2}); +std::cout << "D = " << D << std::endl; +``` + +Output: + +``` +I = +[[1,0,0] + [0,1,0] + [0,0,1]] +D = +[[1,0] + [0,2]] +``` + +nda::eye constructs an identity matrix of a certain size and nda::diag takes a 1-dimensional array and constructs a +square diagonal matrix containing the values of the given array. + +@subsection ex8_p3 Initializing and assigning to matrices and vectors + +Again, initializing and assigning to matrices and vectors works (almost) exactly in the same way as it does for arrays +(see @ref ex3) + +The main difference occurs, when we are assigning a scalar. 
While for arrays and vectors the assignment is done +element-wise, for matrices the scalar is assigned only to the elements on the shorter diagonal and the rest is zeroed +out: + +```cpp +// assigning a scalar to a matrix and vector +M = 3.1415; +v = 2.7182; +std::cout << "M = " << M << std::endl; +std::cout << "v = " << v << std::endl; +``` + +Output: + +``` +M = +[[3.1415,0,0] + [0,3.1415,0] + [0,0,3.1415] + [0,0,0]] +v = [2.7182,2.7182,2.7182] +``` + +@subsection ex8_p4 Views on matrices and vectors + +There are some @ref av_factories for views that are specific to matrices and vectors, otherwise everything mentioned in +@ref ex4 still applies: + +```cpp +// get a vector view of the diagonal of a matrix +auto d = nda::diagonal(D); +std::cout << "d = " << d << std::endl; +std::cout << "Algebra of d: " << nda::get_algebra << std::endl; + +// transform an array into a matrix view +auto A = nda::array{{1, 2}, {3, 4}}; +auto A_mv = nda::make_matrix_view(A); +std::cout << "A_mv = " << A_mv << std::endl; +std::cout << "Algebra of A: " << nda::get_algebra << std::endl; +std::cout << "Algebra of A_mv: " << nda::get_algebra << std::endl; +``` + +Output: + +``` +d = [1,2] +Algebra of d: V +A_mv = +[[1,2] + [3,4]] +Algebra of A: A +Algebra of A_mv: M +``` + +@subsection ex8_p5 HDF5, MPI and symmetry support for matrices and vectors + +We refer the user to the examples +- @ref ex5, +- @ref ex6 and +- @ref ex7. + +There are no mentionable differences between arrays, matrices and vectors regarding those features. + +@subsection ex8_p6 Arithmetic operations with matrices and vectors + +Here the algebra of the involved types becomes important. +In section @ref ex1_p8, we have shortly introduced how arithmetic operations are implemented in **nda** in terms of lazy +expressions and how the algebra of the operands influences the results. + +Let us be more complete in this section. 
+Suppose we have the following objects: +- nda::Array (any type that satisfies this concept): `O1`, `O2`, ... +- nda::array: `A1`, `A2`, ... +- nda::matrix: `M1`, `M2`, ... +- nda::vector: `v1`, `v2`, ... +- scalar: `s1`, `s2`, ... + +Then the following operations are allowed (all operations are lazy unless mentioned otherwise) + +- **Addition** / **Subtraction** + - `O1 +/- O2`: element-wise addition/subtraction, shapes of `O1` and `O2` have to be the same, result has the same + shape + - `s1 +/- A1`/`A1 +/- s1`: element-wise addition/subtraction, result has the same shape as `A1` + - `s1 +/- v1`/`v1 +/- s1`: element-wise addition/subtraction, result has the same shape as `v1` + - `s1 +/- M1`/`M1 +/- s1`: scalar is added/subtracted to/from the elements on the shorter diagonal of `M1`, result has + the same shape as `M1` + +- **Multiplication** + - `A1 * A2`: element-wise multiplication, shapes of `A1` and `A2` have to be the same, result has the same shape + - `M1 * M2`: non-lazy matrix-matrix multiplication, shapes have the expected requirements for matrix multiplication, + calls nda::matmul + - `M1 * v1`: non-lazy matrix-vector multiplication, shapes have the expected requirements for matrix multiplication, + calls nda::matvecmul + - `s1 * O1`/`O1 * s1`: element-wise multiplication, result has the same shape as `O1` + +- **Division** + - `A1 / A2`: element-wise division, shapes of `A1` and `A2` have to be the same, result has the same shape + - `M1 / M2`: non-lazy matrix-matrix multiplication of `M1` by the inverse of `M2`, shapes have the expected + requirements for matrix multiplication with the additional requirement that `M2` is square (since `M2` is inverted), + result is also square with the same size + - `O1 / s1`: element-wise division, result has the same shape as `O1` + - `s1 / A1`: element-wise division, result has the same shape as `A1` (note this is `!= A1 / s1`) + - `s1 / v1`: element-wise division, result has the same shape as `v1` (note this is 
`!= v1 / s1`) + - `s1 / M1`: multiplies (lazy) the scalar with the inverse (non-lazy) of `M1`, only square matrices are allowed (since + `M1` is inverted), result is also square with the same size + +@subsection ex8_p7 Using linear algebra tools + +In addition to the basic matrix-matrix and matrix-vector multiplications described above, **nda** provides some useful +@ref linalg_tools. +While some of them have a custom implementation, e.g. nda::linalg::cross_product, most make use of the @ref linalg_blas +and @ref linalg_lapack to call more optimized routines. + +Let us demonstrate some of its features: + +```cpp +// take the cross product of two vectors +auto v1 = nda::vector{1.0, 2.0, 3.0}; +auto v2 = nda::vector{4.0, 5.0, 6.0}; +auto v3 = nda::linalg::cross_product(v1, v2); +std::cout << "v1 = " << v1 << std::endl; +std::cout << "v2 = " << v2 << std::endl; +std::cout << "v3 = v1 x v2 = " << v3 << std::endl; +``` + +Output: + +``` +v1 = [1,2,3] +v2 = [4,5,6] +v3 = v1 x v2 = [-3,6,-3] +``` + +Here, we first create two 3-dimensional vectors and then take their cross product. + +To check that the cross product is perpendicular to `v1` and `v2`, we can use the dot product: + +```cpp +// check the cross product using the dot product +std::cout << "v1 . v3 = " << nda::dot(v1, v3) << std::endl; +std::cout << "v2 . v3 = " << nda::dot(v2, v3) << std::endl; +``` + +Output: + +``` +v1 . v3 = 0 +v2 . 
v3 = 0 +``` + +The cross product can also be expressed as the matrix-vector product of a skew-symmetric matrix and one of the vectors: +\f[ + \mathbf{a} \times \mathbf{b} = + \begin{bmatrix} + 0 & -a_3 & a_2 \\ + a_3 & 0 & -a_1 \\ + -a_2 & a_1 & 0 + \end{bmatrix} + \begin{bmatrix} + b_1 \\ + b_2 \\ + b_3 + \end{bmatrix} +\f] + +```cpp +// cross product via matrix-vector product +auto M_v1 = nda::matrix{{0, -v1[2], v1[1]}, {v1[2], 0, -v1[0]}, {-v1[1], v1[0], 0}}; +std::cout << "M_v1 = " << M_v1 << std::endl; +auto v3_mv = M_v1 * v2; +std::cout << "v3_mv = " << v3_mv << std::endl; +``` + +Output: + +``` +M_v1 = +[[0,-3,2] + [3,0,-1] + [-2,1,0]] +v3_mv = [-3,6,-3] +``` + +Comparing this result to `v3` above, we see that this is indeed correct. + +Let us now turn to an eigenvalue problem. +**nda** offers the convenience functions nda::linalg::eigenelements and nda::linalg::eigenvalues to obtain the +eigenvalues and eigenvectors of a symmetric or hermitian matrix. + +We start from the following symmetric matrix: + +```cpp +// define a symmetric matrix +auto M1 = nda::matrix{{4, -14, -12}, {-14, 10, 13}, {-12, 13, 1}}; +std::cout << "M1 = " << M1 << std::endl; +``` + +Output: + +``` +M1 = +[[4,-14,-12] + [-14,10,13] + [-12,13,1]] +``` + +Getting the eigenvalues and eigenvectors is quite easy: + +```cpp +// calculate the eigenvalues and eigenvectors of a symmetric matrix +auto [s, Q] = nda::linalg::eigenelements(M1); +std::cout << "Eigenvalues of M1: s = " << s << std::endl; +std::cout << "Eigenvectors of M1: Q = " << Q << std::endl; +``` + +Output: + +``` +Eigenvalues of M1: s = [-9.64366,-6.89203,31.5357] +Eigenvectors of M1: Q = +[[0.594746,-0.580953,0.555671] + [-0.103704,-0.740875,-0.663588] + [0.797197,0.337041,-0.500879]] +``` + +To check the correctness of our calculation, we us the fact that the matrix \f$ \mathbf{M}_1 \f$ can be factorized as +\f[ + \mathbf{M}_1 = \mathbf{Q} \mathbf{\Sigma} \mathbf{Q}^{-1} \; , +\f] +where \f$ \mathbf{Q} \f$ is the matrix with 
the eigenvectors in its columns and \f$ \mathbf{\Sigma} \f$ is the diagonal +matrix of eigenvalues: + +```cpp +// check the eigendecomposition +auto M1_reconstructed = Q * nda::diag(s) * nda::transpose(Q); +std::cout << "M1_reconstructed = " << M1_reconstructed << std::endl; +``` + +Output: + +``` +M1_reconstructed = +[[4,-14,-12] + [-14,10,13] + [-12,13,1]] +``` + +@subsection ex8_p8 Using the BLAS/LAPACK interface + +While the functions in @ref linalg_tools offer a very user-friendly experience, @ref linalg_blas and @ref linalg_lapack +are more low-level and usually require more input from the users. + +Let us show how to use the @ref linalg_lapack to solve a system of linear equations. +The system we want to solve is \f$ \mathbf{A}_1 \mathbf{x}_1 = \mathbf{b}_1 \f$ with + +```cpp +// define the linear system of equations and the right hand side matrix +auto A1 = nda::matrix{{3, 2, -1}, {2, -2, 4}, {-1, 0.5, -1}}; +auto b1 = nda::vector{1, -2, 0}; +std::cout << "A1 = " << A1 << std::endl; +std::cout << "b1 = " << b1 << std::endl; +``` + +Output: + +``` +A1 = +[[3,2,-1] + [2,-2,4] + [-1,0.5,-1]] +b1 = [1,-2,0] +``` + +First, we perform an LU factorization using nda::lapack::getrf: + +```cpp +// LU factorization using the LAPACK interface +auto ipiv = nda::vector(3); +auto LU = A1; +auto info = nda::lapack::getrf(LU, ipiv); +if (info != 0) { + std::cerr << "Error: nda::lapack::getrf failed with error code " << info << std::endl; + return 1; +} +``` + +Here, `ipiv` is a pivot vector that is used internally by the LAPACK routine. +Because nda::lapack::getrf overwrites the input matrix with the result, we first copy the original matrix `A1` into a +new one `LU`. 
+
+Then we solve the system by calling nda::lapack::getrs with the LU factorization, the right hand side vector and the
+pivot vector:
+
+```cpp
+// solve the linear system of equations using the LU factorization
+nda::matrix<double> x1(3, 1);
+x1(nda::range::all, 0) = b1;
+info = nda::lapack::getrs(LU, x1, ipiv);
+if (info != 0) {
+  std::cerr << "Error: nda::lapack::getrs failed with error code " << info << std::endl;
+  return 1;
+}
+std::cout << "x1 = " << x1(nda::range::all, 0) << std::endl;
+```
+
+Output:
+
+```
+x1 = [1,-2,-2]
+```
+
+Since `b1` is a vector but nda::lapack::getrs expects a matrix, we first copy it into the matrix `x1`.
+After the LAPACK call returns, `x1` contains the result in its first column.
+
+Let's check that this is actually the solution to our original system of equations:
+
+```cpp
+// check the solution
+std::cout << "A1 * x1 = " << A1 * x1(nda::range::all, 0) << std::endl;
+```
+
+Output:
+
+```
+A1 * x1 = [1,-2,2.22045e-16]
+```
+
+Considering the finite precision of our calculations, this is indeed equal to the right hand side vector `b1`.
+
+> **Note**: BLAS and LAPACK assume Fortran-ordered arrays. We therefore recommend working with matrices in the
+> nda::F_layout to avoid any confusion. 
+
diff --git a/doc/examples.md b/doc/examples.md
index a9800ad4b..f5fc188dc 100644
--- a/doc/examples.md
+++ b/doc/examples.md
@@ -2,14 +2,30 @@
 
 [TOC]
 
+| Example | Description |
+|---------|-------------|
+| @ref ex1 | A short introduction to **nda** |
+| @ref ex2 | Different ways to construct and get started with arrays |
+| @ref ex3 | Different ways to initialize already constructed arrays |
+| @ref ex4 | How to take views and slices of arrays |
+| @ref ex5 | How to write/read arrays and views to/from HDF5 files |
+| @ref ex6 | How to broadcast, gather, scatter and reduce arrays and views |
+| @ref ex7 | How to use symmetries with **nda** arrays |
+| @ref ex8 | Doing linear algebra with **nda** arrays |
+
 @section compiling Compiling the examples
 
-All examples have been compiled on a MacBook Pro with an Apple M2 Max chip and [HDF5](https://www.hdfgroup.org/solutions/hdf5/) 1.14.3
+All examples have been compiled on a MacBook Pro with an Apple M2 Max chip with
+- [HDF5](https://www.hdfgroup.org/solutions/hdf5/) 1.14.3,
+- [open-mpi](https://www.open-mpi.org/) 5.0.1 and
+- [OpenBLAS](https://www.openblas.net/) 0.3.27
+
 installed via [homebrew](https://brew.sh/).
-We further used clang 17.0.6 together with cmake 3.29.1.
-Assuming that **nda** has been installed locally (see @ref installation) and that the actual example code is in a file `main.cpp`,
-the following generic `CMakeLists.txt` should work for all examples (see also @ref integration):
+We further used clang 19.1.2 together with cmake 3.30.5. 
+ +Assuming that **nda** has been installed locally (see @ref installation) and that the actual example code is in a file +`main.cpp`, the following generic `CMakeLists.txt` should work for all examples (see also @ref integration): ```cmake cmake_minimum_required(VERSION 3.20) diff --git a/doc/groups.dox b/doc/groups.dox index b1db31efd..c476d8e15 100644 --- a/doc/groups.dox +++ b/doc/groups.dox @@ -210,27 +210,26 @@ * nda::make_regular: * * @code{.cpp} - * // create two arrays A and B + * // create an array A * auto A = nda::array({{1, 2}, {3, 4}}); - * auto B = nda::array({{5, 6}, {7, 8}}); * - * // add them elementwise - * auto ex = A + B; // ex is an nda::expr object + * // square the elements of A + * auto ex = nda::pow(A, 2); // ex is a lazy expression * * // evaluate the lazy expression by constructing a new array - * nda::array C = ex; + * nda::array A_sq = ex; + * std::cout << A_sq << std::endl; * * // evaluate the lazy expression using nda::make_regular - * auto D = nda::make_regular(ex); + * std::cout << nda::make_regular(ex) << std::endl; * @endcode * * Output: * * ``` - * C: * [[1,4] * [9,16]] - * D: + * * [[1,4] * [9,16]] * ``` @@ -289,7 +288,7 @@ * std::cout << A << std::endl; * std::cout << B << std::endl; * - * // get representative dat (should be a vector of size 6) + * // get representative data (should be a vector of size 6) * auto vec = grp.get_representative_data(A); * std::cout << "\n" << nda::array_view, 1>(vec) << std::endl; * }