diff --git a/Project.toml b/Project.toml
index 3d89691c..0efa988f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
name = "ReservoirComputing"
uuid = "7c2d2b1e-3dd4-11ea-355a-8f6a8116e294"
authors = ["Francesco Martinuzzi"]
-version = "0.9.4"
+version = "0.9.5"
[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/README.md b/README.md
index 8a30afa8..9172725e 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,13 @@
[![Join the chat at https://julialang.zulipchat.com #sciml-bridged](https://img.shields.io/static/v1?label=Zulip&message=chat&color=9558b2&labelColor=389826)](https://julialang.zulipchat.com/#narrow/stream/279055-sciml-bridged)
[![Global Docs](https://img.shields.io/badge/docs-SciML-blue.svg)](https://docs.sciml.ai/ReservoirComputing/stable/)
- [![arXiv](https://img.shields.io/badge/arXiv-2204.05117-00b300.svg)](https://arxiv.org/abs/2204.05117)
+[![arXiv](https://img.shields.io/badge/arXiv-2204.05117-00b300.svg)](https://arxiv.org/abs/2204.05117)
[![codecov](https://codecov.io/gh/SciML/ReservoirComputing.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/SciML/ReservoirComputing.jl)
[![Build Status](https://github.com/SciML/ReservoirComputing.jl/workflows/CI/badge.svg)](https://github.com/SciML/ReservoirComputing.jl/actions?query=workflow%3ACI)
[![Build status](https://badge.buildkite.com/db8f91b89a10ad79bbd1d9fdb1340e6f6602a1c0ed9496d4d0.svg)](https://buildkite.com/julialang/reservoircomputing-dot-jl)
-[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
+[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor%27s%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
[![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle)
![rc_full_logo_large_white_cropped](https://user-images.githubusercontent.com/10376688/144242116-8243f58a-5ac6-4e0e-88d5-3409f00e20b4.png)
@@ -23,58 +23,68 @@ To illustrate the workflow of this library we will showcase how it is possible t
using ReservoirComputing, OrdinaryDiffEq
#lorenz system parameters
-u0 = [1.0,0.0,0.0]
-tspan = (0.0,200.0)
-p = [10.0,28.0,8/3]
+u0 = [1.0, 0.0, 0.0]
+tspan = (0.0, 200.0)
+p = [10.0, 28.0, 8 / 3]
#define lorenz system
-function lorenz(du,u,p,t)
- du[1] = p[1]*(u[2]-u[1])
- du[2] = u[1]*(p[2]-u[3]) - u[2]
- du[3] = u[1]*u[2] - p[3]*u[3]
+function lorenz(du, u, p, t)
+ du[1] = p[1] * (u[2] - u[1])
+ du[2] = u[1] * (p[2] - u[3]) - u[2]
+ du[3] = u[1] * u[2] - p[3] * u[3]
end
#solve and take data
-prob = ODEProblem(lorenz, u0, tspan, p)
-data = solve(prob, ABM54(), dt=0.02)
+prob = ODEProblem(lorenz, u0, tspan, p)
+data = solve(prob, ABM54(), dt = 0.02)
shift = 300
train_len = 5000
predict_len = 1250
#one step ahead for generative prediction
-input_data = data[:, shift:shift+train_len-1]
-target_data = data[:, shift+1:shift+train_len]
+input_data = data[:, shift:(shift + train_len - 1)]
+target_data = data[:, (shift + 1):(shift + train_len)]
-test = data[:,shift+train_len:shift+train_len+predict_len-1]
+test = data[:, (shift + train_len):(shift + train_len + predict_len - 1)]
```
+
Now that we have the data, we can initialize the ESN with the chosen parameters. Given that this is a quick example, we are going to change as few parameters as possible. For more detailed examples and explanations of the functions, please refer to the documentation.
+
```julia
res_size = 300
-esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, radius=1.2, sparsity=6/res_size),
- input_layer = WeightedLayer(),
- nla_type = NLAT2())
+esn = ESN(input_data;
+ reservoir = RandSparseReservoir(res_size, radius = 1.2, sparsity = 6 / res_size),
+ input_layer = WeightedLayer(),
+ nla_type = NLAT2())
```
The echo state network can now be trained and tested. If not specified, the training will always be Ordinary Least Squares regression. The full range of training methods is detailed in the documentation.
+
```julia
output_layer = train(esn, target_data)
output = esn(Generative(predict_len), output_layer)
```
The data is returned as a matrix, `output` in the code above, that contains the predicted trajectories. The results can now be easily plotted (for the actual script used to obtain this plot please refer to the documentation):
+
```julia
using Plots
-plot(transpose(output),layout=(3,1), label="predicted")
-plot!(transpose(test),layout=(3,1), label="actual")
+plot(transpose(output), layout = (3, 1), label = "predicted")
+plot!(transpose(test), layout = (3, 1), label = "actual")
```
+
![lorenz_basic](https://user-images.githubusercontent.com/10376688/166227371-8bffa318-5c49-401f-9c64-9c71980cb3f7.png)
One can also visualize the phase space of the attractor and the comparison with the actual one:
+
```julia
-plot(transpose(output)[:,1], transpose(output)[:,2], transpose(output)[:,3], label="predicted")
-plot!(transpose(test)[:,1], transpose(test)[:,2], transpose(test)[:,3], label="actual")
+plot(transpose(output)[:, 1],
+ transpose(output)[:, 2],
+ transpose(output)[:, 3],
+ label = "predicted")
+plot!(transpose(test)[:, 1], transpose(test)[:, 2], transpose(test)[:, 3], label = "actual")
```
+
![lorenz_attractor](https://user-images.githubusercontent.com/10376688/81470281-5a34b580-91ea-11ea-9eea-d2b266da19f4.png)
## Citing
diff --git a/docs/make.jl b/docs/make.jl
index 9fa9ec58..7d697952 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -8,13 +8,12 @@ ENV["GKSwstype"] = "100"
include("pages.jl")
makedocs(modules = [ReservoirComputing],
- sitename = "ReservoirComputing.jl",
- clean = true, doctest = false, linkcheck = true,
- warnonly = [:missing_docs],
- format = Documenter.HTML(
- assets = ["assets/favicon.ico"],
- canonical = "https://docs.sciml.ai/ReservoirComputing/stable/"),
- pages = pages)
+ sitename = "ReservoirComputing.jl",
+ clean = true, doctest = false, linkcheck = true,
+ warnonly = [:missing_docs],
+ format = Documenter.HTML(assets = ["assets/favicon.ico"],
+ canonical = "https://docs.sciml.ai/ReservoirComputing/stable/"),
+ pages = pages)
deploydocs(repo = "github.com/SciML/ReservoirComputing.jl.git";
- push_preview = true)
+ push_preview = true)
diff --git a/docs/pages.jl b/docs/pages.jl
index fb96f752..309f76fa 100644
--- a/docs/pages.jl
+++ b/docs/pages.jl
@@ -1,21 +1,21 @@
pages = [
"ReservoirComputing.jl" => "index.md",
"General Settings" => Any["Changing Training Algorithms" => "general/different_training.md",
- "Altering States" => "general/states_variation.md",
- "Generative vs Predictive" => "general/predictive_generative.md"],
+ "Altering States" => "general/states_variation.md",
+ "Generative vs Predictive" => "general/predictive_generative.md"],
"Echo State Network Tutorials" => Any["Lorenz System Forecasting" => "esn_tutorials/lorenz_basic.md",
- #"Mackey-Glass Forecasting on GPU" => "esn_tutorials/mackeyglass_basic.md",
- "Using Different Layers" => "esn_tutorials/change_layers.md",
- "Using Different Reservoir Drivers" => "esn_tutorials/different_drivers.md",
- #"Using Different Training Methods" => "esn_tutorials/different_training.md",
- "Deep Echo State Networks" => "esn_tutorials/deep_esn.md",
- "Hybrid Echo State Networks" => "esn_tutorials/hybrid.md"],
+ #"Mackey-Glass Forecasting on GPU" => "esn_tutorials/mackeyglass_basic.md",
+ "Using Different Layers" => "esn_tutorials/change_layers.md",
+ "Using Different Reservoir Drivers" => "esn_tutorials/different_drivers.md",
+ #"Using Different Training Methods" => "esn_tutorials/different_training.md",
+ "Deep Echo State Networks" => "esn_tutorials/deep_esn.md",
+ "Hybrid Echo State Networks" => "esn_tutorials/hybrid.md"],
"Reservoir Computing with Cellular Automata" => "reca_tutorials/reca.md",
"API Documentation" => Any["Training Algorithms" => "api/training.md",
- "States Modifications" => "api/states.md",
- "Prediction Types" => "api/predict.md",
- "Echo State Networks" => "api/esn.md",
- "ESN Layers" => "api/esn_layers.md",
- "ESN Drivers" => "api/esn_drivers.md",
- "ReCA" => "api/reca.md"],
+ "States Modifications" => "api/states.md",
+ "Prediction Types" => "api/predict.md",
+ "Echo State Networks" => "api/esn.md",
+ "ESN Layers" => "api/esn_layers.md",
+ "ESN Drivers" => "api/esn_drivers.md",
+ "ReCA" => "api/reca.md"],
]
diff --git a/docs/src/api/esn.md b/docs/src/api/esn.md
index ff03dc8c..1caacdd1 100644
--- a/docs/src/api/esn.md
+++ b/docs/src/api/esn.md
@@ -1,16 +1,28 @@
# Echo State Networks
+
+The core component of an ESN is the `ESN` type. It represents the entire Echo State Network and includes parameters for configuring the reservoir, input scaling, and output weights. Here's the documentation for the `ESN` type:
+
```@docs
ESN
```
-In addition to all the components that can be explored in the documentation, a couple components need a separate introduction. The ```variation``` arguments can be
+## Variations
+
+In addition to the standard `ESN` model, there are variations that allow for deeper customization of the underlying model. Currently, there are two available variations: `Default` and `Hybrid`. These variations provide different ways to configure the ESN. Here's the documentation for the variations:
+
```@docs
Default
Hybrid
```
-These arguments detail a deeper variation of the underlying model, and they need a separate call. For the moment, the most complex is the ```Hybrid``` call, but this can and will change in the future.
-All ESN models can be trained using the following call:
+The `Hybrid` variation is the most complex option and offers additional customization. Note that more variations may be added in the future to provide even greater flexibility.
+
+## Training
+
+To train an ESN model, you can use the `train` function. It takes the ESN model, training data, and other optional parameters as input and returns the trained output layer. Here's the documentation for the `train` function:
+
```@docs
train
```
+
+With these components and variations, you can configure and train ESN models for various time series and sequential data prediction tasks.
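+
+As a minimal sketch (with hypothetical random data, just to show the call structure), configuring, training, and predicting could look like this:
+
+```julia
+using ReservoirComputing
+
+input_data = rand(3, 500)    # hypothetical data: 3 features, 500 time steps
+target_data = rand(3, 500)
+
+esn = ESN(input_data;
+    variation = Default(),
+    reservoir = RandSparseReservoir(200, radius = 1.2, sparsity = 0.02))
+
+output_layer = train(esn, target_data)         # defaults to Ordinary Least Squares regression
+output = esn(Generative(100), output_layer)    # generate 100 new steps
+```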
diff --git a/docs/src/api/esn_drivers.md b/docs/src/api/esn_drivers.md
index a11ec35b..0bf0a388 100644
--- a/docs/src/api/esn_drivers.md
+++ b/docs/src/api/esn_drivers.md
@@ -1,13 +1,16 @@
# ESN Drivers
+
```@docs
RNN
MRNN
GRU
```
-The ```GRU``` driver also provides the user with the choice of the possible variants:
+
+The `GRU` driver also provides the user with the choice of the possible variants:
+
```@docs
FullyGated
Minimal
```
-Please refer to the original papers for more detail about these architectures.
+Please refer to the original papers for more detail about these architectures.
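+
+As a brief sketch (assuming an `input_data` matrix is available), the chosen variant is passed through the `variant` keyword argument of `GRU()`, and the resulting driver through the `reservoir_driver` keyword argument of `ESN`:
+
+```julia
+# select the default fully gated variant explicitly
+esn = ESN(input_data;
+    reservoir_driver = GRU(variant = FullyGated()))
+```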
diff --git a/docs/src/api/esn_layers.md b/docs/src/api/esn_layers.md
index 76be5268..bb3b53de 100644
--- a/docs/src/api/esn_layers.md
+++ b/docs/src/api/esn_layers.md
@@ -1,6 +1,7 @@
# ESN Layers
## Input Layers
+
```@docs
WeightedLayer
DenseLayer
@@ -9,16 +10,22 @@
MinimumLayer
NullLayer
```
-The signs in the ```MinimumLayer``` are chosen based on the following methods:
+
+The signs in the `MinimumLayer` are chosen based on the following methods:
+
```@docs
BernoulliSample
IrrationalSample
```
+
To derive the matrix one can call the following function:
+
```@docs
create_layer
```
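+
+As an illustrative sketch (the sizes below are hypothetical), building the input matrix for one of the layers above could look like this:
+
+```julia
+res_size = 300    # hypothetical reservoir size
+in_size = 3       # hypothetical number of input features
+input_matrix = create_layer(MinimumLayer(0.85, IrrationalSample()), res_size, in_size)
+```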
-To create new input layers, it suffices to define a new struct containing the needed parameters of the new input layer. This struct will need to be an ```AbstractLayer```, so the ```create_layer``` function can be dispatched over it. The workflow should follow this snippet:
+
+To create new input layers, it suffices to define a new struct containing the needed parameters of the new input layer. This struct will need to be an `AbstractLayer`, so the `create_layer` function can be dispatched over it. The workflow should follow this snippet:
+
```julia
#creation of the new struct for the layer
struct MyNewLayer <: AbstractLayer
@@ -32,6 +39,7 @@ end
```
## Reservoirs
+
```@docs
RandSparseReservoir
PseudoSVDReservoir
@@ -43,11 +51,13 @@ end
```
Like for the input layers, to actually build the matrix of the reservoir, one can call the following function:
+
```@docs
create_reservoir
```
-To create a new reservoir, the procedure is similar to the one for the input layers. First, the definition of the new struct of type ```AbstractReservoir``` with the reservoir parameters is needed. Then the dispatch over the ```create_reservoir``` function makes the model actually build the reservoir matrix. An example of the workflow is given in the following snippet:
+To create a new reservoir, the procedure is similar to the one for the input layers. First, the definition of the new struct of type `AbstractReservoir` with the reservoir parameters is needed. Then the dispatch over the `create_reservoir` function makes the model actually build the reservoir matrix. An example of the workflow is given in the following snippet:
+
```julia
#creation of the new struct for the reservoir
struct MyNewReservoir <: AbstractReservoir
diff --git a/docs/src/api/predict.md b/docs/src/api/predict.md
index cf3699a4..5409d78e 100644
--- a/docs/src/api/predict.md
+++ b/docs/src/api/predict.md
@@ -1,4 +1,5 @@
# Prediction Types
+
```@docs
Generative
Predictive
diff --git a/docs/src/api/reca.md b/docs/src/api/reca.md
index dcbc86df..48f134dd 100644
--- a/docs/src/api/reca.md
+++ b/docs/src/api/reca.md
@@ -1,11 +1,13 @@
# Reservoir Computing with Cellular Automata
+
```@docs
RECA
```
The input encodings are the equivalent of the input matrices of the ESNs. These are the available encodings:
+
```@docs
RandomMapping
```
-The training and prediction follow the same workflow as the ESN. It is important to note that currently we were unable to find any papers using these models with a ```Generative``` approach for the prediction, so full support is given only to the ```Predictive``` method.
+The training and prediction follow the same workflow as the ESN. It is important to note that, so far, we have been unable to find any papers using these models with a `Generative` approach for the prediction, so full support is given only to the `Predictive` method.
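+
+As a minimal sketch, assuming a `RECA` model `reca` together with `target_data` and `input_data` matrices have already been defined, training and prediction mirror the ESN workflow:
+
+```julia
+output_layer = train(reca, target_data)
+output = reca(Predictive(input_data), output_layer)
+```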
diff --git a/docs/src/api/states.md b/docs/src/api/states.md
index 282863e8..5a4e5686 100644
--- a/docs/src/api/states.md
+++ b/docs/src/api/states.md
@@ -1,6 +1,7 @@
# States Modifications
## Padding and Extension
+
```@docs
StandardStates
ExtendedStates
@@ -9,6 +10,7 @@
```
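+
+For instance (a sketch, assuming an `input_data` matrix), the chosen state modification is passed to the `ESN` constructor through the `states_type` keyword argument:
+
+```julia
+# concatenate the input data to the states during training and prediction
+esn = ESN(input_data;
+    states_type = ExtendedStates())
+```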
## Non Linear Transformations
+
```@docs
NLADefault
NLAT1
diff --git a/docs/src/api/training.md b/docs/src/api/training.md
index b34f046b..a11b2034 100644
--- a/docs/src/api/training.md
+++ b/docs/src/api/training.md
@@ -1,13 +1,16 @@
# Training Algorithms
## Linear Models
+
```@docs
StandardRidge
LinearModel
```
## Gaussian Regression
+
Gaussian regression is currently unavailable as of v0.9.
## Support Vector Regression
-Support Vector Regression is possible using a direct call to [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods. Instead of a wrapper, please refer to the use of ```LIBSVM.AbstractSVR``` in the original library.
+
+Support Vector Regression is possible using a direct call to [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods. No wrapper is provided; please use the `LIBSVM.AbstractSVR` subtypes from the original library directly.
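+
+As a hedged sketch, assuming a constructed `esn`, a `target_data` matrix, and that LIBSVM is loaded, both options are passed directly to `train`:
+
+```julia
+using LIBSVM
+
+# ridge regression with an explicit regularization coefficient
+output_layer = train(esn, target_data, StandardRidge(1e-6))
+
+# support vector regression (EpsilonSVR is one LIBSVM.AbstractSVR subtype)
+svr_output_layer = train(esn, target_data, LIBSVM.EpsilonSVR())
+```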
diff --git a/docs/src/esn_tutorials/change_layers.md b/docs/src/esn_tutorials/change_layers.md
index 0a659cd5..5cfb65cf 100644
--- a/docs/src/esn_tutorials/change_layers.md
+++ b/docs/src/esn_tutorials/change_layers.md
@@ -1,13 +1,16 @@
# Using Different Layers
+
A great deal of effort in the ESN field is devoted to finding the ideal construction for the reservoir matrices. With a simple interface, ReservoirComputing.jl makes it possible to leverage the currently implemented matrix construction methods for both the reservoir and the input layer. This page showcases how to change both of these layers.
The `input_layer` keyword argument provided with the `ESN` constructor allows for changing the input layer. The layers provided in ReservoirComputing.jl are the following:
-- ```WeightedLayer(scaling)```
-- ```DenseLayer(scaling)```
-- ```SparseLayer(scaling, sparsity)```
-- ```MinimumLayer(weight, sampling)```
-- ```InformedLayer(model_in_size; scaling=0.1, gamma=0.5)```
-In addition, the user can define a custom layer following this workflow:
+
+ - `WeightedLayer(scaling)`
+ - `DenseLayer(scaling)`
+ - `SparseLayer(scaling, sparsity)`
+ - `MinimumLayer(weight, sampling)`
+ - `InformedLayer(model_in_size; scaling=0.1, gamma=0.5)`
+
+In addition, the user can define a custom layer following this workflow:
+
```julia
#creation of the new struct for the layer
struct MyNewLayer <: AbstractLayer
@@ -19,14 +22,17 @@ function create_layer(input_layer::MyNewLayer, res_size, in_size)
#the new algorithm to build the input layer goes here
end
```
+
Similarly, the `reservoir` keyword argument provides the possibility to change the construction of the reservoir matrix. The available reservoirs are:
-- ```RandSparseReservoir(res_size, radius, sparsity)```
-- ```PseudoSVDReservoir(res_size, max_value, sparsity, sorted, reverse_sort)```
-- ```DelayLineReservoir(res_size, weight)```
-- ```DelayLineBackwardReservoir(res_size, weight, fb_weight)```
-- ```SimpleCycleReservoir(res_size, weight)```
-- ```CycleJumpsReservoir(res_size, cycle_weight, jump_weight, jump_size)```
-And, like before, it is possible to build a custom reservoir by following this workflow:
+
+ - `RandSparseReservoir(res_size, radius, sparsity)`
+ - `PseudoSVDReservoir(res_size, max_value, sparsity, sorted, reverse_sort)`
+ - `DelayLineReservoir(res_size, weight)`
+ - `DelayLineBackwardReservoir(res_size, weight, fb_weight)`
+ - `SimpleCycleReservoir(res_size, weight)`
+ - `CycleJumpsReservoir(res_size, cycle_weight, jump_weight, jump_size)`
+
+And, like before, it is possible to build a custom reservoir by following this workflow:
+
```julia
#creation of the new struct for the reservoir
struct MyNewReservoir <: AbstractReservoir
@@ -40,9 +46,11 @@ end
```
## Example of a minimally complex ESN
+
Using [^1] and [^2] as references, this section will provide an example of how to change both the input layer and the reservoir for ESNs. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_layers/layers.jl). This example was run on Julia v1.7.2.
The task for this example will be the one step ahead prediction of the Henon map. To obtain the data, one can leverage the package [DynamicalSystems.jl](https://juliadynamics.github.io/DynamicalSystems.jl/dev/). The data is scaled to be between -1 and 1.
+
```@example mesn
using PredefinedDynamicalSystems
train_len = 3000
@@ -51,13 +59,13 @@ predict_len = 2000
ds = PredefinedDynamicalSystems.henon()
traj, time = trajectory(ds, 7000)
data = Matrix(traj)'
-data = (data .-0.5) .* 2
+data = (data .- 0.5) .* 2
shift = 200
-training_input = data[:, shift:shift+train_len-1]
-training_target = data[:, shift+1:shift+train_len]
-testing_input = data[:,shift+train_len:shift+train_len+predict_len-1]
-testing_target = data[:,shift+train_len+1:shift+train_len+predict_len]
+training_input = data[:, shift:(shift + train_len - 1)]
+training_target = data[:, (shift + 1):(shift + train_len)]
+testing_input = data[:, (shift + train_len):(shift + train_len + predict_len - 1)]
+testing_target = data[:, (shift + train_len + 1):(shift + train_len + predict_len)]
```
Now it is possible to define the input layers and reservoirs we want to compare and run the comparison in a simple for loop. The accuracy will be tested using the mean squared deviation `msd` from [StatsBase](https://juliastats.org/StatsBase.jl/stable/).
@@ -66,11 +74,14 @@ Now it is possible to define the input layers and reservoirs we want to compare
using ReservoirComputing, StatsBase
res_size = 300
-input_layer = [MinimumLayer(0.85, IrrationalSample()), MinimumLayer(0.95, IrrationalSample())]
-reservoirs = [SimpleCycleReservoir(res_size, 0.7),
- CycleJumpsReservoir(res_size, cycle_weight=0.7, jump_weight=0.2, jump_size=5)]
+input_layer = [
+ MinimumLayer(0.85, IrrationalSample()),
+ MinimumLayer(0.95, IrrationalSample()),
+]
+reservoirs = [SimpleCycleReservoir(res_size, 0.7),
+ CycleJumpsReservoir(res_size, cycle_weight = 0.7, jump_weight = 0.2, jump_size = 5)]
-for i=1:length(reservoirs)
+for i in 1:length(reservoirs)
esn = ESN(training_input;
input_layer = input_layer[i],
reservoir = reservoirs[i])
@@ -79,11 +90,10 @@ for i=1:length(reservoirs)
println(msd(testing_target, output))
end
```
-As it is possible to see, changing layers in ESN models is straightforward. Be sure to check the API documentation for a full list of reservoirs and layers.
+As it is possible to see, changing layers in ESN models is straightforward. Be sure to check the API documentation for a full list of reservoirs and layers.
## Bibliography
-[^1]: Rodan, Ali, and Peter Tiňo. “Simple deterministically constructed cycle reservoirs with regular jumps.” Neural computation 24.7 (2012): 1822-1852.
+[^1]: Rodan, Ali, and Peter Tiňo. “Simple deterministically constructed cycle reservoirs with regular jumps.” Neural computation 24.7 (2012): 1822-1852.
[^2]: Rodan, Ali, and Peter Tiňo. “Minimum complexity echo state network.” IEEE transactions on neural networks 22.1 (2010): 131-144.
-
diff --git a/docs/src/esn_tutorials/deep_esn.md b/docs/src/esn_tutorials/deep_esn.md
index 6619f722..bd4f7b46 100644
--- a/docs/src/esn_tutorials/deep_esn.md
+++ b/docs/src/esn_tutorials/deep_esn.md
@@ -1,24 +1,26 @@
# Deep Echo State Networks
-Deep Echo State Network architectures started to gain some traction recently. In this guide, we illustrate how it is possible to use ReservoirComputing.jl to build a deep ESN.
+Deep Echo State Network architectures started to gain some traction recently. In this guide, we illustrate how it is possible to use ReservoirComputing.jl to build a deep ESN.
The network implemented in this library is taken from [^1]. It works by stacking reservoirs on top of each other, feeding the output from one into the next. The states are obtained by merging all the inner states of the stacked reservoirs. For a more in-depth explanation, refer to the paper linked above. The full script for this example can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/deep-esn/deepesn.jl). This example was run on Julia v1.7.2.
## Lorenz Example
+
For this example, we are going to reuse the Lorenz data used in the [Lorenz System Forecasting](@ref) example.
+
```@example deep_lorenz
using OrdinaryDiffEq
#define lorenz system
-function lorenz!(du,u,p,t)
- du[1] = 10.0*(u[2]-u[1])
- du[2] = u[1]*(28.0-u[3]) - u[2]
- du[3] = u[1]*u[2] - (8/3)*u[3]
+function lorenz!(du, u, p, t)
+ du[1] = 10.0 * (u[2] - u[1])
+ du[2] = u[1] * (28.0 - u[3]) - u[2]
+ du[3] = u[1] * u[2] - (8 / 3) * u[3]
end
#solve and take data
-prob = ODEProblem(lorenz!, [1.0,0.0,0.0], (0.0,200.0))
-data = solve(prob, ABM54(), dt=0.02)
+prob = ODEProblem(lorenz!, [1.0, 0.0, 0.0], (0.0, 200.0))
+data = solve(prob, ABM54(), dt = 0.02)
#determine shift length, training length and prediction length
shift = 300
@@ -26,22 +28,23 @@ train_len = 5000
predict_len = 1250
#split the data accordingly
-input_data = data[:, shift:shift+train_len-1]
-target_data = data[:, shift+1:shift+train_len]
-test_data = data[:,shift+train_len+1:shift+train_len+predict_len]
+input_data = data[:, shift:(shift + train_len - 1)]
+target_data = data[:, (shift + 1):(shift + train_len)]
+test_data = data[:, (shift + train_len + 1):(shift + train_len + predict_len)]
```
-Again, it is *important* to notice that the data needs to be formatted in a matrix, with the features as rows and time steps as columns, as in this example. This is needed even if the time series consists of single values.
+Again, it is *important* to notice that the data needs to be formatted in a matrix, with the features as rows and time steps as columns, as in this example. This is needed even if the time series consists of single values.
+
+The construction of the ESN is also really similar. The only difference is that the reservoir can be fed as an array of reservoirs.
-The construction of the ESN is also really similar. The only difference is that the reservoir can be fed as an array of reservoirs.
```@example deep_lorenz
using ReservoirComputing
-reservoirs = [RandSparseReservoir(99, radius=1.1, sparsity=0.1),
- RandSparseReservoir(100, radius=1.2, sparsity=0.1),
- RandSparseReservoir(200, radius=1.4, sparsity=0.1)]
+reservoirs = [RandSparseReservoir(99, radius = 1.1, sparsity = 0.1),
+ RandSparseReservoir(100, radius = 1.2, sparsity = 0.1),
+ RandSparseReservoir(200, radius = 1.4, sparsity = 0.1)]
-esn = ESN(input_data;
+esn = ESN(input_data;
variation = Default(),
reservoir = reservoirs,
input_layer = DenseLayer(),
@@ -55,35 +58,39 @@ As it is possible to see, different sizes can be chosen for the different reserv
In addition to using the provided functions for the construction of the layers, the user can also choose to build their own matrix, or array of matrices, and feed that into the `ESN` in the same way.
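+
+As a sketch of that option (the sizes are hypothetical, and a raw random matrix is not rescaled, so the Echo State Property is not guaranteed), custom matrices can be passed directly:
+
+```julia
+# three hand-built reservoir matrices stacked as a deep ESN
+custom_reservoirs = [rand(100, 100) .- 0.5 for _ in 1:3]
+esn = ESN(input_data;
+    reservoir = custom_reservoirs,
+    input_layer = DenseLayer())
+```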
The training and prediction follow the usual framework:
+
```@example deep_lorenz
-training_method = StandardRidge(0.0)
+training_method = StandardRidge(0.0)
output_layer = train(esn, target_data, training_method)
output = esn(Generative(predict_len), output_layer)
```
+
Plotting the results:
+
```@example deep_lorenz
using Plots
ts = 0.0:0.02:200.0
lorenz_maxlyap = 0.9056
-predict_ts = ts[shift+train_len+1:shift+train_len+predict_len]
-lyap_time = (predict_ts .- predict_ts[1])*(1/lorenz_maxlyap)
-
-p1 = plot(lyap_time, [test_data[1,:] output[1,:]], label = ["actual" "predicted"],
- ylabel = "x(t)", linewidth=2.5, xticks=false, yticks = -15:15:15);
-p2 = plot(lyap_time, [test_data[2,:] output[2,:]], label = ["actual" "predicted"],
- ylabel = "y(t)", linewidth=2.5, xticks=false, yticks = -20:20:20);
-p3 = plot(lyap_time, [test_data[3,:] output[3,:]], label = ["actual" "predicted"],
- ylabel = "z(t)", linewidth=2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
-
-
-plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
- layout=(3,1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize=15, yguidefontsize=15,
- legendfontsize=12, titlefontsize=20)
+predict_ts = ts[(shift + train_len + 1):(shift + train_len + predict_len)]
+lyap_time = (predict_ts .- predict_ts[1]) * (1 / lorenz_maxlyap)
+
+p1 = plot(lyap_time, [test_data[1, :] output[1, :]], label = ["actual" "predicted"],
+ ylabel = "x(t)", linewidth = 2.5, xticks = false, yticks = -15:15:15);
+p2 = plot(lyap_time, [test_data[2, :] output[2, :]], label = ["actual" "predicted"],
+ ylabel = "y(t)", linewidth = 2.5, xticks = false, yticks = -20:20:20);
+p3 = plot(lyap_time, [test_data[3, :] output[3, :]], label = ["actual" "predicted"],
+ ylabel = "z(t)", linewidth = 2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
+
+plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
+ layout = (3, 1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize = 15,
+ yguidefontsize = 15,
+ legendfontsize = 12, titlefontsize = 20)
```
Note that there is a known bug at the moment with using `WeightedLayer` as the input layer with the deep ESN. We are in the process of investigating and solving it. In the current implementation, the leak coefficient also has to be the same for all reservoirs; this is another restriction we are actively looking into lifting.
## Bibliography
+
[^1]: Gallicchio, Claudio, and Alessio Micheli. "_Deep echo state network (deepesn): A brief survey._" arXiv preprint arXiv:1712.04323 (2017).
diff --git a/docs/src/esn_tutorials/different_drivers.md b/docs/src/esn_tutorials/different_drivers.md
index 9b9fcd55..0a669451 100644
--- a/docs/src/esn_tutorials/different_drivers.md
+++ b/docs/src/esn_tutorials/different_drivers.md
@@ -1,41 +1,51 @@
# Using Different Reservoir Drivers
+
While the original implementation of the Echo State Network used the equations of Recurrent Neural Networks to obtain non-linearity in the reservoir, other variations have been proposed in recent years. More specifically, the different drivers implemented in ReservoirComputing.jl are the multiple activation function RNN `MRNN()` and the Gated Recurrent Unit `GRU()`. To change them, it suffices to give the chosen method to the `ESN` keyword argument `reservoir_driver`. In this section, some examples of their usage will be given, as well as a brief introduction to their equations.
## Multiple Activation Function RNN
+
Based on the double activation function ESN (DAFESN) proposed in [^1], the Multiple Activation Function ESN expands the idea and allows a custom number of activation functions to be used in the reservoir dynamics. This can be thought of as a linear combination of multiple activation functions with corresponding parameters.
+
```math
\mathbf{x}(t+1) = (1-\alpha)\mathbf{x}(t) + \lambda_1 f_1(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t)) + \dots + \lambda_D f_D(\mathbf{W}\mathbf{x}(t)+\mathbf{W}_{in}\mathbf{u}(t))
```
+
where ``D`` is the number of activation functions and respective parameters chosen.
-The method to call to use the multiple activation function ESN is `MRNN(activation_function, leaky_coefficient, scaling_factor)`. The arguments can be used as both `args` and `kwargs`. `activation_function` and `scaling_factor` have to be vectors (or tuples) containing the chosen activation functions and respective scaling factors (``f_1,...,f_D`` and ``\lambda_1,...,\lambda_D`` following the nomenclature introduced above). The `leaky_coefficient` represents ``\alpha`` and it is a single value.
+The method to call to use the multiple activation function ESN is `MRNN(activation_function, leaky_coefficient, scaling_factor)`. The arguments can be used as both `args` and `kwargs`. `activation_function` and `scaling_factor` have to be vectors (or tuples) containing the chosen activation functions and respective scaling factors (``f_1,...,f_D`` and ``\lambda_1,...,\lambda_D`` following the nomenclature introduced above). The `leaky_coefficient` represents ``\alpha`` and it is a single value.
Starting with the example, the data used is based on the following function based on the DAFESN paper [^1]. A full script of the example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/mrnn/mrnn.jl). This example was run on Julia v1.7.2.
+
```@example mrnn
-u(t) = sin(t)+sin(0.51*t)+sin(0.22*t)+sin(0.1002*t)+sin(0.05343*t)
+u(t) = sin(t) + sin(0.51 * t) + sin(0.22 * t) + sin(0.1002 * t) + sin(0.05343 * t)
```
For this example, the type of prediction will be one step ahead. The metric used to assess the quality of the prediction will be the normalized root-mean-square deviation `rmsd` from [StatsBase](https://juliastats.org/StatsBase.jl/stable/). Like in the other examples, the first step is to gather the data:
+
```@example mrnn
train_len = 3000
predict_len = 2000
shift = 1
data = u.(collect(0.0:0.01:500))
-training_input = reduce(hcat, data[shift:shift+train_len-1])
-training_target = reduce(hcat, data[shift+1:shift+train_len])
-testing_input = reduce(hcat, data[shift+train_len:shift+train_len+predict_len-1])
-testing_target = reduce(hcat, data[shift+train_len+1:shift+train_len+predict_len])
+training_input = reduce(hcat, data[shift:(shift + train_len - 1)])
+training_target = reduce(hcat, data[(shift + 1):(shift + train_len)])
+testing_input = reduce(hcat,
+ data[(shift + train_len):(shift + train_len + predict_len - 1)])
+testing_target = reduce(hcat,
+ data[(shift + train_len + 1):(shift + train_len + predict_len)])
```
To follow the paper more closely, it is necessary to define a couple of activation functions. The numbering of them follows the ones in the paper. Of course, one can also use any custom-defined function, available in the base language or any activation function from [NNlib](https://fluxml.ai/Flux.jl/stable/models/nnlib/#Activation-Functions).
+
```@example mrnn
-f2(x) = (1-exp(-x))/(2*(1+exp(-x)))
-f3(x) = (2/pi)*atan((pi/2)*x)
-f4(x) = x/sqrt(1+x*x)
+f2(x) = (1 - exp(-x)) / (2 * (1 + exp(-x)))
+f3(x) = (2 / pi) * atan((pi / 2) * x)
+f4(x) = x / sqrt(1 + x * x)
```
It is now possible to build different drivers, using the parameters suggested by the paper. Also, in this instance, the numbering follows the test cases of the paper. In the end, a simple for loop is implemented to compare the different drivers and activation functions.
+
```@example mrnn
using ReservoirComputing, Random, StatsBase
@@ -47,14 +57,14 @@ base_case = RNN(tanh, 0.85)
#MRNN() test cases
#Parameter given as kwargs
-case3 = MRNN(activation_function=[tanh, f2],
- leaky_coefficient=0.85,
- scaling_factor=[0.5, 0.3])
+case3 = MRNN(activation_function = [tanh, f2],
+ leaky_coefficient = 0.85,
+ scaling_factor = [0.5, 0.3])
#Parameter given as kwargs
-case4 = MRNN(activation_function=[tanh, f3],
- leaky_coefficient=0.9,
- scaling_factor=[0.45, 0.35])
+case4 = MRNN(activation_function = [tanh, f3],
+ leaky_coefficient = 0.9,
+ scaling_factor = [0.45, 0.35])
#Parameter given as args
case5 = MRNN([tanh, f4], 0.9, [0.43, 0.13])
@@ -63,23 +73,26 @@ case5 = MRNN([tanh, f4], 0.9, [0.43, 0.13])
test_cases = [base_case, case3, case4, case5]
for case in test_cases
esn = ESN(training_input,
- input_layer = WeightedLayer(scaling=0.3),
- reservoir = RandSparseReservoir(100, radius=0.4),
+ input_layer = WeightedLayer(scaling = 0.3),
+ reservoir = RandSparseReservoir(100, radius = 0.4),
reservoir_driver = case,
states_type = ExtendedStates())
wout = train(esn, training_target, StandardRidge(10e-6))
output = esn(Predictive(testing_input), wout)
- println(rmsd(testing_target, output, normalize=true))
+ println(rmsd(testing_target, output, normalize = true))
end
```
In this example, it is also possible to observe how parameters are passed to the methods `RNN()` and `MRNN()`, both by argument and by keyword argument.
## Gated Recurrent Unit
+
+Gated Recurrent Units (GRUs) [^2] have been proposed in more recent years with the intent of limiting notable problems of RNNs, like the vanishing gradient. This change in the underlying equations can be easily transported into the Reservoir Computing paradigm, by switching the RNN equations in the reservoir with the GRU equations. This approach has been explored in [^3] and [^4]. Different variations of GRU have been proposed [^5][^6]; the following subsections go into detail about the governing equations and their implementation in ReservoirComputing.jl. Like before, to access the GRU reservoir driver, it suffices to set the `reservoir_driver` keyword argument of `ESN` to `GRU()`. All the variations that will be presented can be used in this package by leveraging the keyword argument `variant` in the method `GRU()` and specifying the chosen variant: `FullyGated()` or `Minimal()`. Other variations are possible by modifying the inner layers and reservoirs. The default is set to the standard version `FullyGated()`. The first section will go into more detail about the default of the `GRU()` method, and the following ones will refer to it to minimize repetitions. This example was run on Julia v1.7.2.
### Standard GRU
+
The equations for the standard GRU are as follows:
+
```math
\mathbf{r}(t) = \sigma (\mathbf{W}^r_{\text{in}}\mathbf{u}(t)+\mathbf{W}^r\mathbf{x}(t-1)+\mathbf{b}_r) \\
\mathbf{z}(t) = \sigma (\mathbf{W}^z_{\text{in}}\mathbf{u}(t)+\mathbf{W}^z\mathbf{x}(t-1)+\mathbf{b}_z) \\
@@ -87,19 +100,22 @@ The equations for the standard GRU are as follows:
\mathbf{x}(t) = \mathbf{z}(t) \odot \mathbf{x}(t-1)+(1-\mathbf{z}(t)) \odot \tilde{\mathbf{x}}(t)
```
-Going over the `GRU` keyword argument, it will be explained how to feed the desired input to the model.
- - `activation_function` is a vector with default values `[NNlib.sigmoid, NNlib.sigmoid, tanh]`. This argument controls the activation functions of the GRU, going from top to bottom. Changing the first element corresponds to changing the activation function for ``\mathbf{r}(t)`` and so on.
- - `inner_layer` is a vector with default values `fill(DenseLayer(), 2)`. This keyword argument controls the ``\mathbf{W}_{\text{in}}``s going from top to bottom like before.
- - `reservoir` is a vector with default value `fill(RandSparseReservoir(), 2)`. In a similar fashion to `inner_layer`, this keyword argument controls the reservoir matrix construction in a top to bottom order.
- - `bias` is again a vector with default value `fill(DenseLayer(), 2)`. It is meant to control the ``\mathbf{b}``s, going as usual from top to bottom.
- - `variant` controls the GRU variant. The default value is set to `FullyGated()`.
-
-It is important to notice that `inner_layer` and `reservoir` control every layer except ``\mathbf{W}_{in}`` and ``\mathbf{W}`` and ``\mathbf{b}``. These arguments are given as input to the `ESN()` call as `input_layer`, `reservoir` and `bias`.
+Going over the `GRU` keyword argument, it will be explained how to feed the desired input to the model.
+
+ - `activation_function` is a vector with default values `[NNlib.sigmoid, NNlib.sigmoid, tanh]`. This argument controls the activation functions of the GRU, going from top to bottom. Changing the first element corresponds to changing the activation function for ``\mathbf{r}(t)`` and so on.
+ - `inner_layer` is a vector with default values `fill(DenseLayer(), 2)`. This keyword argument controls the ``\mathbf{W}_{\text{in}}``s going from top to bottom like before.
+ - `reservoir` is a vector with default value `fill(RandSparseReservoir(), 2)`. In a similar fashion to `inner_layer`, this keyword argument controls the reservoir matrix construction in a top to bottom order.
+ - `bias` is again a vector with default value `fill(DenseLayer(), 2)`. It is meant to control the ``\mathbf{b}``s, going as usual from top to bottom.
+ - `variant` controls the GRU variant. The default value is set to `FullyGated()`.
+
+It is important to notice that `inner_layer` and `reservoir` control every layer except ``\mathbf{W}_{in}`` and ``\mathbf{W}`` and ``\mathbf{b}``. These arguments are given as input to the `ESN()` call as `input_layer`, `reservoir` and `bias`.
The following sections are going to illustrate the variations of the GRU architecture and how to obtain them in ReservoirComputing.jl.
### Type 1
+
The first variation of the GRU is dependent only on the previous hidden state and the bias:
+
```math
\mathbf{r}(t) = \sigma (\mathbf{W}^r\mathbf{x}(t-1)+\mathbf{b}_r) \\
\mathbf{z}(t) = \sigma (\mathbf{W}^z\mathbf{x}(t-1)+\mathbf{b}_z) \\
@@ -108,7 +124,9 @@ The first variation of the GRU is dependent only on the previous hidden state an
To obtain this variation, it will suffice to set `inner_layer = fill(NullLayer(), 2)` and leaving the `variant = FullyGated()`.
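+
+A sketch of the corresponding driver construction (keeping the default reservoirs and biases) could look like this:
+
+```julia
+gru_type1 = GRU(variant = FullyGated(),
+    inner_layer = fill(NullLayer(), 2))
+```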
### Type 2
+
The second variation only depends on the previous hidden state:
+
```math
\mathbf{r}(t) = \sigma (\mathbf{W}^r\mathbf{x}(t-1)) \\
\mathbf{z}(t) = \sigma (\mathbf{W}^z\mathbf{x}(t-1)) \\
@@ -117,7 +135,9 @@ The second variation only depends on the previous hidden state:
Similarly to before, to obtain this variation, it is only required to set `inner_layer = fill(NullLayer(), 2)` and `bias = fill(NullLayer(), 2)` while keeping `variant = FullyGated()`.
### Type 3
+
The final variation, before the minimal one, depends only on the biases
+
```math
\mathbf{r}(t) = \sigma (\mathbf{b}_r) \\
\mathbf{z}(t) = \sigma (\mathbf{b}_z) \\
@@ -125,8 +145,10 @@ The final variation, before the minimal one, depends only on the biases
This means that it is only needed to set `inner_layer = fill(NullLayer(), 2)` and `reservoir = fill(NullReservoir(), 2)` while keeping `variant = FullyGated()`.
-### Minimal
+### Minimal
+
The minimal GRU variation merges two gates into one:
+
```math
\mathbf{f}(t) = \sigma (\mathbf{W}^f_{\text{in}}\mathbf{u}(t)+\mathbf{W}^f\mathbf{x}(t-1)+\mathbf{b}_f) \\
\tilde{\mathbf{x}}(t) = \text{tanh}(\mathbf{W}_{in}\mathbf{u}(t)+\mathbf{W}(\mathbf{f}(t) \odot \mathbf{x}(t-1))+\mathbf{b}) \\
@@ -136,9 +158,11 @@ The minimal GRU variation merges two gates into one:
This variation can be obtained by setting `variant = Minimal()`. The `inner_layer`, `reservoir` and `bias` kwargs this time are **not** vectors, but must be defined like, for example, `inner_layer = DenseLayer()` or `reservoir = RandSparseReservoir(res_size)`.
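+
+A sketch of this configuration, under the assumptions above, could look like this:
+
+```julia
+gru_minimal = GRU(variant = Minimal(),
+    inner_layer = DenseLayer(),
+    reservoir = RandSparseReservoir(100),    # hypothetical inner reservoir size
+    bias = DenseLayer())
+```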
### Examples
-To showcase the use of the `GRU()` method, this section will only illustrate the standard `FullyGated()` version. The full script for this example with the data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/gru/).
-The data used for this example is the Santa Fe laser dataset [^7] retrieved from [here](https://web.archive.org/web/20160427182805/http://www-psych.stanford.edu/~andreas/Time-Series/SantaFe.html). The data is split to account for a next step prediction.
+To showcase the use of the `GRU()` method, this section will only illustrate the standard `FullyGated()` version. The full script for this example with the data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/change_drivers/gru/).
+
+The data used for this example is the Santa Fe laser dataset [^7] retrieved from [here](https://web.archive.org/web/20160427182805/http://www-psych.stanford.edu/%7Eandreas/Time-Series/SantaFe.html). The data is split to account for a next step prediction.
+
```@example gru
using DelimitedFiles
@@ -148,12 +172,13 @@ train_len = 5000
predict_len = 2000
training_input = data[:, 1:train_len]
-training_target = data[:, 2:train_len+1]
-testing_input = data[:,train_len+1:train_len+predict_len]
-testing_target = data[:,train_len+2:train_len+predict_len+1]
+training_target = data[:, 2:(train_len + 1)]
+testing_input = data[:, (train_len + 1):(train_len + predict_len)]
+testing_target = data[:, (train_len + 2):(train_len + predict_len + 1)]
```
-The construction of the ESN proceeds as usual.
+The construction of the ESN proceeds as usual.
+
```@example gru
using ReservoirComputing, Random
@@ -161,22 +186,24 @@ res_size = 300
res_radius = 1.4
Random.seed!(42)
-esn = ESN(training_input;
- reservoir = RandSparseReservoir(res_size, radius=res_radius),
+esn = ESN(training_input;
+ reservoir = RandSparseReservoir(res_size, radius = res_radius),
reservoir_driver = GRU())
```
The default inner reservoir and input layer for the GRU are the same as the defaults for the `reservoir` and `input_layer` of the ESN. The explicit call can be used if preferred.
+
```@example gru
-gru = GRU(reservoir=[RandSparseReservoir(res_size),
- RandSparseReservoir(res_size)],
- inner_layer=[DenseLayer(), DenseLayer()])
-esn = ESN(training_input;
- reservoir = RandSparseReservoir(res_size, radius=res_radius),
+gru = GRU(reservoir = [RandSparseReservoir(res_size),
+ RandSparseReservoir(res_size)],
+ inner_layer = [DenseLayer(), DenseLayer()])
+esn = ESN(training_input;
+ reservoir = RandSparseReservoir(res_size, radius = res_radius),
reservoir_driver = gru)
```
The training and prediction can proceed as usual:
+
```@example gru
training_method = StandardRidge(0.0)
output_layer = train(esn, training_target, training_method)
@@ -184,25 +211,27 @@ output = esn(Predictive(testing_input), output_layer)
```
The results can be plotted using Plots.jl
+
```@example gru
using Plots
-plot([testing_target' output'], label=["actual" "predicted"],
- plot_title="Santa Fe Laser",
- titlefontsize=20,
- legendfontsize=12,
- linewidth=2.5,
+plot([testing_target' output'], label = ["actual" "predicted"],
+ plot_title = "Santa Fe Laser",
+ titlefontsize = 20,
+ legendfontsize = 12,
+ linewidth = 2.5,
xtickfontsize = 12,
ytickfontsize = 12,
- size=(1080, 720))
+ size = (1080, 720))
```
It is interesting to see a comparison of the GRU-driven ESN and the standard RNN-driven ESN. Using the same parameters defined before, it is possible to do the following:
+
```@example gru
using StatsBase
-esn_rnn = ESN(training_input;
- reservoir = RandSparseReservoir(res_size, radius=res_radius),
+esn_rnn = ESN(training_input;
+ reservoir = RandSparseReservoir(res_size, radius = res_radius),
reservoir_driver = RNN())
output_layer = train(esn_rnn, training_target, training_method)
diff --git a/docs/src/esn_tutorials/hybrid.md b/docs/src/esn_tutorials/hybrid.md
index 2a7b72b7..bf274f01 100644
--- a/docs/src/esn_tutorials/hybrid.md
+++ b/docs/src/esn_tutorials/hybrid.md
@@ -1,48 +1,54 @@
# Hybrid Echo State Networks
+
Following the idea of giving physical information to machine learning models, hybrid echo state networks [^1] try to achieve this goal by feeding model data into the ESN. In this example, it is explained how to create and leverage such models in ReservoirComputing.jl. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/hybrid/hybrid.jl). This example was run on Julia v1.7.2.
## Generating the data
+
For this example, we are going to forecast the Lorenz system. As usual, the data is generated leveraging `DifferentialEquations.jl`:
+
```@example hybrid
using DifferentialEquations
-u0 = [1.0,0.0,0.0]
-tspan = (0.0,1000.0)
+u0 = [1.0, 0.0, 0.0]
+tspan = (0.0, 1000.0)
datasize = 100000
-tsteps = range(tspan[1], tspan[2], length = datasize)
+tsteps = range(tspan[1], tspan[2], length = datasize)
-function lorenz(du,u,p,t)
- p = [10.0,28.0,8/3]
- du[1] = p[1]*(u[2]-u[1])
- du[2] = u[1]*(p[2]-u[3]) - u[2]
- du[3] = u[1]*u[2] - p[3]*u[3]
+function lorenz(du, u, p, t)
+ p = [10.0, 28.0, 8 / 3]
+ du[1] = p[1] * (u[2] - u[1])
+ du[2] = u[1] * (p[2] - u[3]) - u[2]
+ du[3] = u[1] * u[2] - p[3] * u[3]
end
ode_prob = ODEProblem(lorenz, u0, tspan)
ode_sol = solve(ode_prob, saveat = tsteps)
-ode_data =Array(ode_sol)
+ode_data = Array(ode_sol)
train_len = 10000
-input_data = ode_data[:, 1:train_len]
-target_data = ode_data[:, 2:train_len+1]
-test_data = ode_data[:, train_len+1:end][:, 1:1000]
+input_data = ode_data[:, 1:train_len]
+target_data = ode_data[:, 2:(train_len + 1)]
+test_data = ode_data[:, (train_len + 1):end][:, 1:1000]
predict_len = size(test_data, 2)
tspan_train = (tspan[1], ode_sol.t[train_len])
```
## Building the Hybrid Echo State Network
+
To feed the data to the ESN, it is necessary to create a suitable function.
+
```@example hybrid
function prior_model_data_generator(u0, tspan, tsteps, model = lorenz)
- prob = ODEProblem(lorenz, u0, tspan)
+ prob = ODEProblem(lorenz, u0, tspan)
sol = Array(solve(prob, saveat = tsteps))
return sol
end
```
Given the initial condition, time span, and time steps, this function returns the data for the chosen model. Now, using the `Hybrid` method, it is possible to input all this information to the model.
+
```@example hybrid
using ReservoirComputing, Random
Random.seed!(42)
@@ -55,31 +61,35 @@ esn = ESN(input_data,
```
## Training and Prediction
+
The training and prediction of the Hybrid ESN can proceed as usual:
+
```@example hybrid
output_layer = train(esn, target_data, StandardRidge(0.3))
output = esn(Generative(predict_len), output_layer)
```
It is now possible to plot the results, leveraging Plots.jl:
+
```@example hybrid
using Plots
lorenz_maxlyap = 0.9056
-predict_ts = tsteps[train_len+1:train_len+predict_len]
-lyap_time = (predict_ts .- predict_ts[1])*(1/lorenz_maxlyap)
-
-p1 = plot(lyap_time, [test_data[1,:] output[1,:]], label = ["actual" "predicted"],
- ylabel = "x(t)", linewidth=2.5, xticks=false, yticks = -15:15:15);
-p2 = plot(lyap_time, [test_data[2,:] output[2,:]], label = ["actual" "predicted"],
- ylabel = "y(t)", linewidth=2.5, xticks=false, yticks = -20:20:20);
-p3 = plot(lyap_time, [test_data[3,:] output[3,:]], label = ["actual" "predicted"],
- ylabel = "z(t)", linewidth=2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
-
-
-plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
- layout=(3,1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize=15, yguidefontsize=15,
- legendfontsize=12, titlefontsize=20)
+predict_ts = tsteps[(train_len + 1):(train_len + predict_len)]
+lyap_time = (predict_ts .- predict_ts[1]) * (1 / lorenz_maxlyap)
+
+p1 = plot(lyap_time, [test_data[1, :] output[1, :]], label = ["actual" "predicted"],
+ ylabel = "x(t)", linewidth = 2.5, xticks = false, yticks = -15:15:15);
+p2 = plot(lyap_time, [test_data[2, :] output[2, :]], label = ["actual" "predicted"],
+ ylabel = "y(t)", linewidth = 2.5, xticks = false, yticks = -20:20:20);
+p3 = plot(lyap_time, [test_data[3, :] output[3, :]], label = ["actual" "predicted"],
+ ylabel = "z(t)", linewidth = 2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
+
+plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
+ layout = (3, 1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize = 15,
+ yguidefontsize = 15,
+ legendfontsize = 12, titlefontsize = 20)
```
## Bibliography
+
[^1]: Pathak, Jaideep, et al. "_Hybrid forecasting of chaotic processes: Using machine learning in conjunction with a knowledge-based model._" Chaos: An Interdisciplinary Journal of Nonlinear Science 28.4 (2018): 041101.
diff --git a/docs/src/esn_tutorials/lorenz_basic.md b/docs/src/esn_tutorials/lorenz_basic.md
index 84b53220..f820a36d 100644
--- a/docs/src/esn_tutorials/lorenz_basic.md
+++ b/docs/src/esn_tutorials/lorenz_basic.md
@@ -1,25 +1,28 @@
- # Lorenz System Forecasting
-
-This example expands on the readme Lorenz system forecasting to better showcase how to use methods and functions provided in the library for Echo State Networks. Here the prediction method used is ```Generative```, for a more detailed explanation of the differences between ```Generative``` and ```Predictive``` please refer to the other examples given in the documentation. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/lorenz_basic/lorenz_basic.jl). This example was run on Julia v1.7.2.
+# Lorenz System Forecasting
+
+This example expands on the readme Lorenz system forecasting to better showcase how to use methods and functions provided in the library for Echo State Networks. Here the prediction method used is `Generative`, for a more detailed explanation of the differences between `Generative` and `Predictive` please refer to the other examples given in the documentation. The full script for this example is available [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/blob/main/lorenz_basic/lorenz_basic.jl). This example was run on Julia v1.7.2.
## Generating the data
-Starting off the workflow, the first step is to obtain the data. Leveraging ```OrdinaryDiffEq``` it is possible to derive the Lorenz system data in the following way:
+
+Starting off the workflow, the first step is to obtain the data. Leveraging `OrdinaryDiffEq` it is possible to derive the Lorenz system data in the following way:
+
```@example lorenz
using OrdinaryDiffEq
#define lorenz system
-function lorenz!(du,u,p,t)
- du[1] = 10.0*(u[2]-u[1])
- du[2] = u[1]*(28.0-u[3]) - u[2]
- du[3] = u[1]*u[2] - (8/3)*u[3]
+function lorenz!(du, u, p, t)
+ du[1] = 10.0 * (u[2] - u[1])
+ du[2] = u[1] * (28.0 - u[3]) - u[2]
+ du[3] = u[1] * u[2] - (8 / 3) * u[3]
end
#solve and take data
-prob = ODEProblem(lorenz!, [1.0,0.0,0.0], (0.0,200.0))
-data = solve(prob, ABM54(), dt=0.02)
+prob = ODEProblem(lorenz!, [1.0, 0.0, 0.0], (0.0, 200.0))
+data = solve(prob, ABM54(), dt = 0.02)
```
-After obtaining the data, it is necessary to determine the kind of prediction for the model. Since this example will use the ```Generative``` prediction type, this means that the target data will be the next step of the input data. In addition, it is important to notice that the Lorenz system just obtained presents a transient period that is not representative of the general behavior of the system. This can easily be discarded by setting a ```shift``` parameter.
+After obtaining the data, it is necessary to determine the kind of prediction for the model. Since this example will use the `Generative` prediction type, this means that the target data will be the next step of the input data. In addition, it is important to notice that the Lorenz system just obtained presents a transient period that is not representative of the general behavior of the system. This can easily be discarded by setting a `shift` parameter.
+
```@example lorenz
#determine shift length, training length and prediction length
shift = 300
@@ -27,50 +30,56 @@ train_len = 5000
predict_len = 1250
#split the data accordingly
-input_data = data[:, shift:shift+train_len-1]
-target_data = data[:, shift+1:shift+train_len]
-test_data = data[:,shift+train_len+1:shift+train_len+predict_len]
+input_data = data[:, shift:(shift + train_len - 1)]
+target_data = data[:, (shift + 1):(shift + train_len)]
+test_data = data[:, (shift + train_len + 1):(shift + train_len + predict_len)]
```
-It is *important* to notice that the data needs to be formatted in a matrix with the features as rows and time steps as columns as in this example. This is needed even if the time series consists of single values.
+It is *important* to notice that the data needs to be formatted in a matrix with the features as rows and time steps as columns as in this example. This is needed even if the time series consists of single values.
## Building the Echo State Network
-Once the data is ready, it is possible to define the parameters for the ESN and the ```ESN``` struct itself. In this example, the values from [^1] are loosely followed as general guidelines.
+
+Once the data is ready, it is possible to define the parameters for the ESN and the `ESN` struct itself. In this example, the values from [^1] are loosely followed as general guidelines.
+
```@example lorenz
using ReservoirComputing
#define ESN parameters
res_size = 300
res_radius = 1.2
-res_sparsity = 6/300
+res_sparsity = 6 / 300
input_scaling = 0.1
#build ESN struct
-esn = ESN(input_data;
+esn = ESN(input_data;
variation = Default(),
- reservoir = RandSparseReservoir(res_size, radius=res_radius, sparsity=res_sparsity),
- input_layer = WeightedLayer(scaling=input_scaling),
+ reservoir = RandSparseReservoir(res_size, radius = res_radius, sparsity = res_sparsity),
+ input_layer = WeightedLayer(scaling = input_scaling),
reservoir_driver = RNN(),
nla_type = NLADefault(),
states_type = StandardStates())
```
-Most of the parameters chosen here mirror the default ones, so a direct call is not necessary. The readme example is identical to this one, except for the explicit call. Going line by line to see what is happening, starting from ```res_size```: this value determines the dimensions of the reservoir matrix. In this case, a size of 300 has been chosen, so the reservoir matrix will be 300 x 300. This is not always the case, since some input layer constructions can modify the dimensions of the reservoir, but in that case, everything is taken care of internally.
+Most of the parameters chosen here mirror the default ones, so a direct call is not necessary. The readme example is identical to this one, except for the explicit call. Going line by line to see what is happening, starting from `res_size`: this value determines the dimensions of the reservoir matrix. In this case, a size of 300 has been chosen, so the reservoir matrix will be 300 x 300. This is not always the case, since some input layer constructions can modify the dimensions of the reservoir, but in that case, everything is taken care of internally.
+
+The `res_radius` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to ensure the Echo State Property. The default value in the `RandSparseReservoir()` method is 1.0 in accordance with the most commonly followed guidelines found in the literature (see [^2] and references therein). The `sparsity` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course, it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. In this example, the call to the parameters inside `RandSparseReservoir()` was done explicitly to showcase the meaning of each of them, but it is also possible to simply pass the values directly, like so `RandSparseReservoir(1.2, 6/300)`.
-The ```res_radius``` determines the scaling of the spectral radius of the reservoir matrix; a proper scaling is necessary to assure the Echo State Property. The default value in the ```RandSparseReservoir()``` method is 1.0 in accordance with the most commonly followed guidelines found in the literature (see [^2] and references therein). The ```sparsity``` of the reservoir matrix in this case is obtained by choosing a degree of connections and dividing that by the reservoir size. Of course, it is also possible to simply choose any value between 0.0 and 1.0 to test behaviors for different sparsity values. In this example, the call to the parameters inside ```RandSparseReservoir()``` was done explicitly to showcase the meaning of each of them, but it is also possible to simply pass the values directly, like so ```RandSparseReservoir(1.2, 6/300)```.
+The value of `input_scaling` determines the upper and lower bounds of the uniform distribution of the weights in the `WeightedLayer()`. Like before, this value can be passed either as an argument or as a keyword argument, e.g. `WeightedLayer(0.1)`. The value of 0.1 represents the default. The default input layer is the `DenseLayer`, a fully connected layer. The details of the weighted version can be found in [^3]; for this example, this version returns the best results.
-The value of ```input_scaling``` determines the upper and lower bounds of the uniform distribution of the weights in the ```WeightedLayer()```. Like before, this value can be passed either as an argument or as a keyword argument ```WeightedLayer(0.1)```. The value of 0.1 represents the default. The default input layer is the ```DenseLayer```, a fully connected layer. The details of the weighted version can be found in [^3], for this example, this version returns the best results.
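+
+The two equivalent ways of setting the input scaling mentioned above look as follows:
+
+```julia
+input_layer = WeightedLayer(scaling = 0.1)  # keyword form, used in this example
+input_layer = WeightedLayer(0.1)            # positional form
+```
+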
+The reservoir driver represents the dynamics of the reservoir. In the standard ESN definition, these dynamics are obtained through a Recurrent Neural Network (RNN), and this is reflected by calling the `RNN` driver for the `ESN` struct. This option is the default and does not need to be specified explicitly unless its parameters have to be changed. The full equation is the following:
-The reservoir driver represents the dynamics of the reservoir. In the standard ESN definition, these dynamics are obtained through a Recurrent Neural Network (RNN), and this is reflected by calling the ```RNN``` driver for the ```ESN``` struct. This option is set as the default, and unless there is the need to change parameters, it is not needed. The full equation is the following:
```math
\textbf{x}(t+1) = (1-\alpha)\textbf{x}(t) + \alpha \cdot \text{tanh}(\textbf{W}\textbf{x}(t)+\textbf{W}_{\text{in}}\textbf{u}(t))
```
-where ``α`` represents the leaky coefficient, and tanh can be any activation function. Also, ``\textbf{x}`` represents the state vector, ``\textbf{u}`` the input data, and ``\textbf{W}, \textbf{W}_{\text{in}}`` are the reservoir matrix and input matrix, respectively. The default call to the RNN in the library is the following ```RNN(;activation_function=tanh, leaky_coefficient=1.0)```, where the meaning of the parameters is clear from the equation above. Instead of the hyperbolic tangent, any activation function can be used, either leveraging external libraries such as ```NNlib``` or creating a custom one.
-The final calls are modifications to the states in training or prediction. The default calls, depicted in the example, do not make any modifications to the states. This is the safest bet if one is not sure how these work. The ```nla_type``` applies a non-linear algorithm to the states, while the ```states_type``` can expand them by concatenating them with the input data, or padding them by concatenating a constant value to all the states. More in depth descriptions of these parameters are given in other examples in the documentation.
+where ``α`` represents the leaky coefficient, and tanh can be any activation function. Also, ``\textbf{x}`` represents the state vector, ``\textbf{u}`` the input data, and ``\textbf{W}, \textbf{W}_{\text{in}}`` are the reservoir matrix and input matrix, respectively. The default call to the RNN in the library is the following: `RNN(;activation_function=tanh, leaky_coefficient=1.0)`, where the meaning of the parameters is clear from the equation above. Instead of the hyperbolic tangent, any activation function can be used, either leveraging external libraries such as `NNlib` or creating a custom one.
+
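+As a small sketch of the driver call quoted above (the modified leaky coefficient is purely illustrative):
+
+```julia
+default_driver = RNN(activation_function = tanh, leaky_coefficient = 1.0)  # the default, written out
+leaky_driver = RNN(activation_function = tanh, leaky_coefficient = 0.7)    # a leaky variant
+esn_leaky = ESN(input_data; reservoir_driver = leaky_driver)
+```
+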
+The final calls are modifications to the states in training or prediction. The default calls, depicted in the example, do not make any modifications to the states. This is the safest bet if one is not sure how these work. The `nla_type` applies a non-linear algorithm to the states, while the `states_type` can expand them by concatenating them with the input data, or padding them by concatenating a constant value to all the states. More in depth descriptions of these parameters are given in other examples in the documentation.
## Training and Prediction
+
Now that the ESN has been created and all the parameters have been explained, it is time to proceed with the training. The full call of the readme example follows this general idea:
+
```@example lorenz
#define training method
training_method = StandardRidge(0.0)
@@ -79,37 +88,41 @@ training_method = StandardRidge(0.0)
output_layer = train(esn, target_data, training_method)
```
-The training returns an ```OutputLayer``` struct containing the trained output matrix and other needed for the prediction. The necessary elements in the ```train()``` call are the ```ESN``` struct created in the previous step and the ```target_data```, which in this case is the one step ahead evolution of the Lorenz system. The training method chosen in this example is the standard one, so an equivalent way of calling the ```train``` function here is ```output_layer = train(esn, target_data)``` like the readme basic version. Likewise, the default value for the ridge regression parameter is set to zero, so the actual default training is Ordinary Least Squares regression. Other training methods are available and will be explained in the following examples.
+The training returns an `OutputLayer` struct containing the trained output matrix and the other information needed for the prediction. The necessary elements in the `train()` call are the `ESN` struct created in the previous step and the `target_data`, which in this case is the one step ahead evolution of the Lorenz system. The training method chosen in this example is the standard one, so an equivalent way of calling the `train` function here is `output_layer = train(esn, target_data)`, like the basic README version. Likewise, the default value for the ridge regression parameter is set to zero, so the actual default training is Ordinary Least Squares regression. Other training methods are available and will be explained in the following examples.
+
+Once the `OutputLayer` has been obtained, the prediction can be done following this procedure:
-Once the ```OutputLayer``` has been obtained, the prediction can be done following this procedure:
```@example lorenz
output = esn(Generative(predict_len), output_layer)
```
-both the training method and the output layer are needed in this call. The number of steps for the prediction must be specified in the ```Generative``` method. The output results are given in a matrix.
+
+Both the prediction method and the output layer are needed in this call. The number of steps for the prediction must be specified in the `Generative` method. The output results are given in a matrix.
!!! info "Saving the states during prediction"
+
While the states are saved in the `ESN` struct for the training, for the prediction they are not saved by default. To inspect the states, it is necessary to pass the boolean keyword argument `save_states` to the prediction call, in this example using `esn(... ; save_states=true)`. This returns a tuple `(output, states)` where `size(states) = res_size, prediction_len`
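+
+A quick sketch of the keyword described above, reusing the `esn`, `output_layer`, and `predict_len` defined in this example:
+
+```julia
+# returns both the forecast and the reservoir states visited during prediction
+output, states = esn(Generative(predict_len), output_layer; save_states = true)
+size(states)  # (res_size, predict_len)
+```
+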
-To inspect the results, they can easily be plotted using an external library. In this case, ```Plots``` is adopted:
+To inspect the results, they can easily be plotted using an external library. In this case, `Plots` is adopted:
+
```@example lorenz
using Plots, Plots.PlotMeasures
ts = 0.0:0.02:200.0
lorenz_maxlyap = 0.9056
-predict_ts = ts[shift+train_len+1:shift+train_len+predict_len]
-lyap_time = (predict_ts .- predict_ts[1])*(1/lorenz_maxlyap)
-
-p1 = plot(lyap_time, [test_data[1,:] output[1,:]], label = ["actual" "predicted"],
- ylabel = "x(t)", linewidth=2.5, xticks=false, yticks = -15:15:15);
-p2 = plot(lyap_time, [test_data[2,:] output[2,:]], label = ["actual" "predicted"],
- ylabel = "y(t)", linewidth=2.5, xticks=false, yticks = -20:20:20);
-p3 = plot(lyap_time, [test_data[3,:] output[3,:]], label = ["actual" "predicted"],
- ylabel = "z(t)", linewidth=2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
-
-
-plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
- layout=(3,1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize=15, yguidefontsize=15,
- legendfontsize=12, titlefontsize=20)
+predict_ts = ts[(shift + train_len + 1):(shift + train_len + predict_len)]
+lyap_time = (predict_ts .- predict_ts[1]) * (1 / lorenz_maxlyap)
+
+p1 = plot(lyap_time, [test_data[1, :] output[1, :]], label = ["actual" "predicted"],
+ ylabel = "x(t)", linewidth = 2.5, xticks = false, yticks = -15:15:15);
+p2 = plot(lyap_time, [test_data[2, :] output[2, :]], label = ["actual" "predicted"],
+ ylabel = "y(t)", linewidth = 2.5, xticks = false, yticks = -20:20:20);
+p3 = plot(lyap_time, [test_data[3, :] output[3, :]], label = ["actual" "predicted"],
+ ylabel = "z(t)", linewidth = 2.5, xlabel = "max(λ)*t", yticks = 10:15:40);
+
+plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
+ layout = (3, 1), xtickfontsize = 12, ytickfontsize = 12, xguidefontsize = 15,
+ yguidefontsize = 15,
+ legendfontsize = 12, titlefontsize = 20)
```
## Bibliography
@@ -117,4 +130,3 @@ plot(p1, p2, p3, plot_title = "Lorenz System Coordinates",
[^1]: Pathak, Jaideep, et al. "_Using machine learning to replicate chaotic attractors and calculate Lyapunov exponents from data._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.12 (2017): 121102.
[^2]: Lukoševičius, Mantas. "_A practical guide to applying echo state networks._" Neural networks: Tricks of the trade. Springer, Berlin, Heidelberg, 2012. 659-686.
[^3]: Lu, Zhixin, et al. "_Reservoir observers: Model-free inference of unmeasured variables in chaotic systems._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102.
-
diff --git a/docs/src/general/different_training.md b/docs/src/general/different_training.md
index 231389c1..bb92680c 100644
--- a/docs/src/general/different_training.md
+++ b/docs/src/general/different_training.md
@@ -1,5 +1,7 @@
# Changing Training Algorithms
+
Notably Echo State Networks have been trained with Ridge Regression algorithms, but the range of useful algorithms to use is much greater. In this section of the documentation, it is possible to explore how to use other training methods to obtain the readout layer. All the methods implemented in ReservoirComputing.jl can be used for all models in the library, not only ESNs. The general workflow illustrated in this section will be based on a dummy RC model `my_model = MyModel(...)` that needs training to obtain the readout layer. The training is done as follows:
+
```julia
training_algo = TrainingAlgo()
readout_layer = train(my_model, train_data, training_algo)
@@ -8,40 +10,23 @@ readout_layer = train(my_model, train_data, training_algo)
In this section, it is possible to explore how to properly build the `training_algo` and all the possible choices available. In the example section of the documentation it will be provided copy-pasteable code to better explore the training algorithms and their impact on the model.
## Linear Models
-The library includes a standard implementation of ridge regression, callable using `StandardRidge(regularization_coeff)` where the default value for the regularization coefficient is set to zero. This is also the default model called when no model is specified in `train()`. This makes the default call for training `train(my_model, train_data)` use Ordinary Least Squares (OLS) for regression.
-Leveraging [MLJLinearModels](https://juliaai.github.io/MLJLinearModels.jl/stable/) it is possible to expand the choices of linear models used for the training. The wrappers provided are structured in the following way:
-```julia
-struct LinearModel
- regression
- solver
- regression_kwargs
-end
-```
-to call the ridge regression using the MLJLinearModels APIs, one can use `LinearModel(;regression=LinearRegression)`. It is also possible to use a specific solver, by calling `LinearModel(regression=LinearRegression, solver=Analytical())`. For all the available solvers, please refer to the [MLJLinearModels documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/models/). To change the regularization coefficient in the ridge example, using for example `lambda = 0.1`, it is needed to pass it in the `regression_kwargs` like so `LinearModel(;regression=LinearRegression, solver=Analytical(), regression_kwargs=(lambda=lambda))`. The nomenclature of the coefficients must follow the MLJLinearModels APIs, using `lambda, gamma` for `LassoRegression` and `delta, lambda, gamma` for `HuberRegression`. Again, please check the [relevant documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/api/) if in doubt. When using MLJLinearModels based regressors, do remember to specify `using MLJLinearModels`.
+The library includes a standard implementation of ridge regression, callable using `StandardRidge(regularization_coeff)`. The default regularization coefficient is set to zero. This is also the default model called when no model is specified in `train()`. This makes the default call for training `train(my_model, train_data)` use Ordinary Least Squares (OLS) for regression.
+
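+The two equivalent default calls mentioned above look as follows, using the dummy `my_model` and `train_data` from this section:
+
+```julia
+readout_layer = train(my_model, train_data)                      # implicit default
+readout_layer = train(my_model, train_data, StandardRidge(0.0))  # explicit OLS (zero regularization)
+```
+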
+Leveraging [MLJLinearModels](https://juliaai.github.io/MLJLinearModels.jl/stable/), you can expand your choices of linear models for training. The wrappers provided follow this structure:
-## Gaussian Processes
-Another way to obtain the readout layer is possible using Gaussian regression. This is provided through a wrapper of [GaussianProcesses](http://stor-i.github.io/GaussianProcesses.jl/latest/) structured in the following way:
```julia
-struct GaussianProcess
- mean
- kernel
- lognoise
- optimize
- optimizer
+struct LinearModel
+ regression::Any
+ solver::Any
+ regression_kwargs::Any
end
```
-While it is necessary to specify a `mean` and a `kernel`, the other defaults are `lognoise=-2, optimize=false, optimizer=Optim.LBFGS()`. For the choice of means and kernels, please refer to the proper documentation, [here](http://stor-i.github.io/GaussianProcesses.jl/latest/mean/) and [here](http://stor-i.github.io/GaussianProcesses.jl/latest/kernels/), respectively.
-Building on the simple example given in the GaussianProcesses documentation, it is possible to build an intuition of how to use these algorithms for training ReservoirComputing.jl models.
-```julia
-mZero = MeanZero() #Zero mean function
-kern = SE(0.0,0.0) #Squared exponential kernel (note that hyperparameters are on the log scale)
-logObsNoise = -1.0
+To call the ridge regression using the MLJLinearModels APIs, you can use `LinearModel(;regression=LinearRegression)`. You can also choose a specific solver by calling, for example, `LinearModel(regression=LinearRegression, solver=Analytical())`. For all the available solvers, please refer to the [MLJLinearModels documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/models/).
-gp = GaussianProcess(mZero, kern, lognoise=logObsNoise)
-```
-Like in the previous case, if one uses GaussianProcesses based regressors, it is necessary to specify `using GaussianProcesses`. Additionally, if the optimizer chosen is from an external package, i.e. Optim, that package needs to be used in the script as well by adding `using Optim`.
+To change the regularization coefficient in the ridge example, using for example `lambda = 0.1`, it is needed to pass it in the `regression_kwargs` like so `LinearModel(;regression=LinearRegression, solver=Analytical(), regression_kwargs=(lambda=lambda))`. The nomenclature of the coefficients must follow the MLJLinearModels APIs, using `lambda, gamma` for `LassoRegression` and `delta, lambda, gamma` for `HuberRegression`. Again, please check the [relevant documentation](https://juliaai.github.io/MLJLinearModels.jl/stable/api/) if in doubt. When using MLJLinearModels based regressors, do remember to specify `using MLJLinearModels`.
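+
+As a hedged sketch of the wrapper usage described above (built on the `LinearModel` structure shown earlier; the regressor and keyword names come from MLJLinearModels and may differ between versions):
+
+```julia
+using ReservoirComputing, MLJLinearModels
+
+# Lasso regression with lambda = 0.1; note the trailing comma, which makes
+# (lambda = 0.1,) a one-element NamedTuple
+training_algo = LinearModel(regression = LassoRegression,
+    regression_kwargs = (lambda = 0.1,))
+readout_layer = train(my_model, train_data, training_algo)
+```
+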
## Support Vector Regression
-Contrary to the `LinearModel`s and `GaussianProcess`es, no wrappers are needed for support vector regression. By using [LIBSVM.jl](https://github.com/JuliaML/LIBSVM.jl), LIBSVM wrappers in Julia, it is possible to call both `epsilonSVR()` or `nuSVR()` directly in `train()`. For the full range of kernels provided and the parameters to call, we refer the user to the official [documentation](https://www.csie.ntu.edu.tw/~cjlin/libsvm/). Like before, if one intends to use LIBSVM regressors, it is necessary to specify `using LIBSVM`.
+
+Contrary to the `LinearModel`s, no wrappers are needed for support vector regression. By using [LIBSVM.jl](https://github.com/JuliaML/LIBSVM.jl), the Julia wrappers for LIBSVM, it is possible to call either `epsilonSVR()` or `nuSVR()` directly in `train()`. For the full range of kernels provided and the parameters to call, we refer the user to the official [documentation](https://www.csie.ntu.edu.tw/%7Ecjlin/libsvm/). Like before, if one intends to use LIBSVM regressors, it is necessary to specify `using LIBSVM`.
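+
+A short sketch of the direct LIBSVM usage described above (the constructor name `EpsilonSVR` is the one currently exported by LIBSVM.jl and is an assumption here; `my_model` and `train_data` are the dummy objects from this section):
+
+```julia
+using ReservoirComputing, LIBSVM
+
+svr = LIBSVM.EpsilonSVR()  # or LIBSVM.NuSVR()
+readout_layer = train(my_model, train_data, svr)
+```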
diff --git a/docs/src/general/predictive_generative.md b/docs/src/general/predictive_generative.md
index 0c61f08c..c3be7b26 100644
--- a/docs/src/general/predictive_generative.md
+++ b/docs/src/general/predictive_generative.md
@@ -1,10 +1,17 @@
# Generative vs Predictive
-The library provides two different methods for prediction, denoted as `Predictive()` and `Generative()`, following the two major applications of Reservoir Computing models found in the literature. Both of these methods are given as arguments for the trained model. While copy-pasteable example swill be provided further on in the documentation, it is better to clarify the difference early on to focus more on the library implementation going forward.
+
+The library provides two different methods for prediction, denoted as `Predictive()` and `Generative()`. These methods correspond to the two major applications of Reservoir Computing models found in the literature. This section aims to clarify the differences between these two methods before providing further details on their usage in the library.
## Predictive
-In the first method, the user can use Reservoir Computing models similarly as standard Machine Learning models. This means using a set of features as input and a set of labels as outputs. In this case, the features and labels can be vectors of different dimensions, as ``X=\{x_1,...,x_n\} \ x_i \in \mathbb{R}^{N}`` and ``Y=\{y_1,...,y_n\} \ y_i \in \mathbb{R}^{M}`` where ``X`` is the feature set and ``Y`` the label set. Given the difference in dimensionality for the prediction call, it will be needed to feed to the function the feature set to be labeled, for example by calling `Predictive(X)` using the set given in this example.
-!this allows for one step ahead or h steps ahead prediction.
+In the first method, users can utilize Reservoir Computing models in a manner similar to standard Machine Learning models. This involves using a set of features as input and a set of labels as outputs. In this case, both the feature and label sets can consist of vectors of different dimensions. Specifically, let's denote the feature set as ``X=\{x_1,...,x_n\}`` where ``x_i \in \mathbb{R}^{N}``, and the label set as ``Y=\{y_1,...,y_n\}`` where ``y_i \in \mathbb{R}^{M}``.
+
+To make predictions using this method, you need to provide the feature set that you want to predict the labels for. For example, you can call `Predictive(X)` using the feature set ``X`` as input. This method allows for both one-step-ahead and multi-step-ahead predictions.
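+
+A minimal sketch of the call, using a small random dataset and an ESN purely for illustration:
+
+```julia
+using ReservoirComputing
+
+train_data = rand(3, 100)
+esn = ESN(train_data)
+output_layer = train(esn, rand(3, 100))
+
+X = rand(3, 50)  # feature set to be labeled
+prediction = esn(Predictive(X), output_layer)
+```
+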
## Generative
-The generative method allows the user to extend the forecasting capabilities of the model, letting the predicted results to be fed back into the model to generate the next prediction. By doing so, the model can run autonomously, without any feature dataset as input. The call for this model needs only the number of steps that the user intends to forecast, for example calling `Generative(100)` to generate one hundred time steps.
+
+The generative method provides a different approach to forecasting with Reservoir Computing models. It enables you to extend the forecasting capabilities of the model by allowing predicted results to be fed back into the model to generate the next prediction. This autonomy allows the model to make predictions without the need for a feature dataset as input.
+
+To use the generative method, you only need to specify the number of time steps that you intend to forecast. For instance, you can call `Generative(100)` to generate predictions for the next one hundred time steps.
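+
+Using the small `esn` and `output_layer` from the sketch above, an autonomous forecast of one hundred steps looks like:
+
+```julia
+forecast = esn(Generative(100), output_layer)  # matrix with 100 generated time steps
+```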
+
+The key distinction between these methods lies in how predictions are made. The predictive method relies on input feature sets to make predictions, while the generative method allows for autonomous forecasting by feeding predicted results back into the model.
diff --git a/docs/src/general/states_variation.md b/docs/src/general/states_variation.md
index 521fbdad..a1b0dab5 100644
--- a/docs/src/general/states_variation.md
+++ b/docs/src/general/states_variation.md
@@ -1,38 +1,53 @@
# Altering States
-In every ReservoirComputing model is possible to perform some alteration on the states in the training stage. Depending on the chosen modification, this can improve the results of the prediction. Or more simply, they can be used to reproduce results in the literature. The alterations are divided into two possibilities: the first concerns padding or extending the states, and the second concerns non-linear algorithms performed over the states.
+
+In ReservoirComputing models, it's possible to perform alterations on the reservoir states during the training stage. These alterations can improve prediction results or replicate results found in the literature. Alterations are categorized into two possibilities: padding or extending the states, and applying non-linear algorithms to the states.
## Padding and Extending States
-Extending the states means appending to them the corresponding input values. If ``\textbf{x}(t)`` is the reservoir state at time t corresponding to the input ``\textbf{u}(t)`` the extended state will be represented as `` [\textbf{x}(t); \textbf{u}(t)]`` where ``[;]`` is intended as vertical concatenation. This procedure is, for example, used in [Jaeger's Scholarpedia](http://www.scholarpedia.org/article/Echo_state_network) description of Echo State Networks. The extension of the states can be obtained in every ReservoirComputing.jl model by using the keyword argument `states_type` and calling the method `ExtendedStates()`. No argument is needed.
-Padding the states means appending a constant value, 1.0 for example, to each state. Using the notation introduced before, we can define the padded states as ``[\textbf{x}(t); 1.0]``. This approach is detailed in the [seminal guide](https://mantas.info/get-publication/?f=Practical_ESN.pdf) to Echo State Networks by Mantas Lukoševičius. By using the keyword argument `states_type` the user can call the method `PaddedStates(padding)` where `padding` represents the value that will be concatenated to the states. As default, the value is set to unity, so the majority of the time, calling `PaddedStates()` will suffice.
+### Extending States
+
+Extending the states involves appending the corresponding input values to the reservoir states. If ``\textbf{x}(t)`` represents the reservoir state at time ``t`` corresponding to the input ``\textbf{u}(t)``, the extended state is represented as ``[\textbf{x}(t); \textbf{u}(t)]``, where ``[;]`` denotes vertical concatenation. This procedure is commonly used in Echo State Networks and is described in [Jaeger's Scholarpedia](http://www.scholarpedia.org/article/Echo_state_network). You can extend the states in every ReservoirComputing.jl model by using the `states_type` keyword argument and calling the `ExtendedStates()` method. No additional arguments are needed.
+
+### Padding States
-Though not easily found in the literature, it is also possible to pad the extended states by using the method `PaddedExtendedStates(padding)` that has unity as `padding` default as well.
+Padding the states involves appending a constant value, such as 1.0, to each state. In the notation introduced earlier, padded states can be represented as ``[\textbf{x}(t); 1.0]``. This approach is detailed in the [seminal guide](https://mantas.info/get-publication/?f=Practical_ESN.pdf) to Echo State Networks by Mantas Lukoševičius. To pad the states, you can use the `states_type` keyword argument and call the `PaddedStates(padding)` method, where `padding` represents the value to be concatenated to the states. By default, the padding value is set to 1.0, so most of the time, calling `PaddedStates()` will suffice.
-Of course, it is also possible to not apport any of these changes to the states by calling `StandardStates()`. This is also the default choice for the states.
+Additionally, you can pad the extended states by using the `PaddedExtendedStates(padding)` method, which also has a default padding value of 1.0.
+
+You can choose not to apply any of these changes to the states by calling `StandardStates()`, which is the default choice for the states.
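+
+A small sketch of the four options described above, all selected through the `states_type` keyword (with a random placeholder dataset):
+
+```julia
+using ReservoirComputing
+
+train_data = rand(3, 100)  # placeholder data
+esn_standard = ESN(train_data; states_type = StandardStates())        # default
+esn_extended = ESN(train_data; states_type = ExtendedStates())        # [x(t); u(t)]
+esn_padded = ESN(train_data; states_type = PaddedStates(1.0))         # [x(t); 1.0]
+esn_padext = ESN(train_data; states_type = PaddedExtendedStates())    # extended, then padded
+```
+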
## Non-Linear Algorithms
-First introduced in [^1] and expanded in [^2] these are nonlinear combinations of the columns of the matrix states. There are three such algorithms implemented. Using the keyword argument `nla_type` it is possible to choose in every model in ReservoirComputing.jl the specific non-linear algorithm to use. The default value is set to `NLADefault()`, where no non-linear algorithm takes place.
-Following the nomenclature used in [^2], the algorithms can be called as `NLAT1()`, `NLAT2()` and `NLAT3()`. To better explain what they do, let ``\textbf{x}_{i, j}`` be elements of the state matrix, with ``i=1,...,T \ j=1,...,N`` where ``T`` is the length of the training and ``N`` is the reservoir size.
+First introduced in [^1] and expanded in [^2], non-linear algorithms are nonlinear combinations of the columns of the matrix states. There are three such algorithms implemented in ReservoirComputing.jl, and you can choose which one to use with the `nla_type` keyword argument. The default value is set to `NLADefault()`, which means no non-linear algorithm is applied.
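+
+Selecting a specific algorithm only requires the `nla_type` keyword, for example (with `train_data` a placeholder matrix):
+
+```julia
+esn = ESN(train_data; nla_type = NLAT2())
+```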
+
+The available non-linear algorithms are:
+
+ - `NLAT1()`
+ - `NLAT2()`
+ - `NLAT3()`
+
+These algorithms perform specific operations on the reservoir states. To provide a better understanding of what they do, let ``\textbf{x}_{i, j}`` be elements of the state matrix, with ``i=1,...,T \ j=1,...,N`` where ``T`` is the length of the training and ``N`` is the reservoir size.
**NLAT1**
+
```math
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \times \textbf{x}_{i,j} \ \ \text{if \textit{j} is odd} \\
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is even}
```
**NLAT2**
+
```math
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j-2} \ \ \text{if \textit{j} > 1 is odd} \\
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is 1 or even}
```
**NLAT3**
+
```math
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j-1} \times \textbf{x}_{i,j+1} \ \ \text{if \textit{j} > 1 is odd} \\
\tilde{\textbf{x}}_{i,j} = \textbf{x}_{i,j} \ \ \text{if \textit{j} is 1 or even}
```
[^1]: Pathak, Jaideep, et al. "_Using machine learning to replicate chaotic attractors and calculate Lyapunov exponents from data._" Chaos: An Interdisciplinary Journal of Nonlinear Science 27.12 (2017): 121102.
-
[^2]: Chattopadhyay, Ashesh, Pedram Hassanzadeh, and Devika Subramanian. "_Data-driven predictions of a multiscale Lorenz 96 chaotic system using machine-learning methods: reservoir computing, artificial neural network, and long short-term memory network._" Nonlinear Processes in Geophysics 27.3 (2020): 373-389.
diff --git a/docs/src/index.md b/docs/src/index.md
index e2add232..ac8e1f66 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,103 +1,53 @@
# ReservoirComputing.jl
-ReservoirComputing.jl provides an efficient, modular, and easy to use implementation of Reservoir Computing models such as Echo State Networks (ESNs). Reservoir Computing (RC) is an umbrella term used to describe a family of models such as ESNs and Liquid State Machines (LSMs). The key concept is to expand the input data into a higher dimension and use regression to train the model; in some ways, Reservoir Computers can be considered similar to kernel methods.
-
+ReservoirComputing.jl is a versatile and user-friendly Julia package for the implementation of Reservoir Computing models, such as Echo State Networks (ESNs). Central to Reservoir Computing is the expansion of the input data into a higher-dimensional space, with regression used to train the model; in this sense, Reservoir Computers bear some resemblance to kernel methods. The package follows a modular design, keeping it easy to use for newcomers while remaining flexible for advanced users.
!!! info "Introductory material"
- This library assumes some basic knowledge of Reservoir Computing. For a good introduction, we suggest the following papers: the first two are the seminal papers about ESN and LSM, the others are in-depth review papers that should cover all the needed information. For the majority of the algorithms implemented in this library, we cited in the documentation the original work introducing them. If you ever have doubts about a method or a function, just type ```? function``` in the Julia REPL to read the relevant notes.
-
- * Jaeger, Herbert: The “echo state” approach to analyzing and training recurrent neural networks-with an erratum note.
- * Maass W, Natschläger T, Markram H: Real-time computing without stable states: a new framework for neural computation based on perturbations.
- * Lukoševičius, Mantas: A practical guide to applying echo state networks." Neural networks: Tricks of the trade.
- * Lukoševičius, Mantas, and Herbert Jaeger: Reservoir computing approaches to recurrent neural network training.
+ This library assumes some basic knowledge of Reservoir Computing. For a good introduction, we suggest the following papers: the first two are the seminal papers about ESN and LSM, the others are in-depth review papers that should cover all the needed information. For the majority of the algorithms implemented in this library we cited in the documentation the original work introducing them. If you ever are in doubt about a method or a function just type `? function` in the Julia REPL to read the relevant notes.
+
+ - Jaeger, Herbert: The “echo state” approach to analyzing and training recurrent neural networks-with an erratum note.
+ - Maass W, Natschläger T, Markram H: Real-time computing without stable states: a new framework for neural computation based on perturbations.
+ - Lukoševičius, Mantas: A practical guide to applying echo state networks." Neural networks: Tricks of the trade.
+ - Lukoševičius, Mantas, and Herbert Jaeger: Reservoir computing approaches to recurrent neural network training.
+
!!! info "Performance tip"
- For faster computations on the CPU, it is suggested to add `using MKL` to the script. For clarity's sake, this library will not be indicated under every example in the documentation.
+
+ For faster computations on the CPU it is suggested to add `using MKL` to the script. For clarity's sake this library will not be indicated under every example in the documentation.
+
## Installation
-To install ReservoirComputing.jl, use the Julia package manager:
+To install ReservoirComputing.jl, ensure you have Julia version 1.6 or higher. Follow these steps:
+
+ 1. Open the Julia command line.
+ 2. Enter the Pkg REPL mode by pressing `]`.
+ 3. Type `add ReservoirComputing` and press Enter. Equivalently, the package can be added from the Julia prompt, as shown in the snippet after this list.
+
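+```julia
+using Pkg
+Pkg.add("ReservoirComputing")
+```
+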
+For a more customized installation, or to contribute to the package, you can install it directly from the repository:
```julia
using Pkg
-Pkg.add("ReservoirComputing")
+Pkg.add(url = "https://github.com/SciML/ReservoirComputing.jl.git")
```
-The support for this library is for Julia v1.6 or greater.
+
+or `dev` the package.
## Features Overview
-This library provides multiple ways of training the chosen RC model. More specifically, the available algorithms are:
-- ```StandardRidge```: a naive implementation of Ridge Regression. The default choice for training.
-- ```LinearModel```: a wrap around [MLJLinearModels](https://juliaai.github.io/MLJLinearModels.jl/stable/).
-- ```LIBSVM.AbstractSVR```: a direct call of [LIBSVM](https://github.com/JuliaML/LIBSVM.jl) regression methods.
-
-Also provided are two different ways of making predictions using RC:
-- ```Generative```: the algorithm uses the prediction of the model in the previous step to continue the prediction. It only needs the number of steps as input.
-- ```Predictive```: standard Machine Learning type of prediction. Given the features, the RC model will return the label/prediction.
-
-It is possible to modify the RC obtained states in the training and prediction steps using the following:
-- ```StandardStates```: default choice, no changes will be made to the states.
-- ```ExtendedStates```: the states are extended using a vertical concatenation, with the input data.
-- ```PaddedStates```: the states are padded using a vertical concatenation with the chosen padding value.
-- ```PaddedExtendedStates```: a combination of the first two. First, the states are extended and then padded.
-
-In addition, another modification is possible through the choice of non-linear algorithms:
-- ```NLADefault```: default choice, no changes will be made to the states.
-- ```NLAT1```
-- ```NLAT2```
-- ```NLAT3```
-
-### Echo State Networks
-For ESNs the following input layers are implemented :
-- ```WeightedLayer```: weighted layer matrix with weights sampled from a uniform distribution.
-- ```DenseLayer```: dense layer matrix with weights sampled from a uniform distribution.
-- ```SparseLayer```: sparse layer matrix with weights sampled from a uniform distribution.
-- ```MinimumLayer```: matrix with constant weights and weight sign decided following one of the two:
- - ```BernoulliSample```
- - ```IrrationalSample```
-- ```InformedLayer```: special kin of weighted layer matrix for Hybrid ESNs.
-
-The package also contains multiple implementations of Reservoirs:
-- ```RandSparseReservoir```: random sparse matrix with scaling of spectral radius
-- ```PseudoSVDReservoir```: Pseudo SVD construction of a random sparse matrix
-- ```DelayLineReservoir```: minimal matrix with chosen weights
-- ```DelayLineBackwardReservoir```: minimal matrix with chosen weights
-- ```SimpleCycleReservoir```: minimal matrix with chosen weights
-- ```CycleJumpsReservoir```: minimal matrix with chosen weights
-
-In addition, multiple ways of driving the reservoir states are also provided:
-- ```RNN```: standard Recurrent Neural Network driver.
-- ```MRNN```: Multiple RNN driver, it consists of a linear combination of RNNs
-- ```GRU```: gated Recurrent Unit driver, with all the possible GRU variants available:
- - ```FullyGated```
- - ```Minimal```
-
-A hybrid version of the model is also available through ```Hybrid```
-
-### Reservoir Computing with Cellular Automata
-The package provides also an implementation of Reservoir Computing models based on one dimensional Cellular Automata through the ```RECA``` call. For the moment, the only input encoding available (an input encoding plays a similar role to the input matrix for ESNs) is a random mapping, called through ```RandomMapping```.
-
-All the training methods described above can be used, as can all the modifications to the states. Both prediction methods are also possible in theory, although in the literature only ```Predictive``` tasks have been explored.
+ - **Multiple Training Algorithms**: Supports Ridge Regression, Linear Models, and LIBSVM regression methods for Reservoir Computing models.
+ - **Diverse Prediction Methods**: Offers both generative and predictive methods for Reservoir Computing predictions.
+ - **Modifiable Training and Prediction**: Allows modifications in Reservoir Computing states, such as state extension, padding, and combination methods.
+ - **Non-linear Algorithm Options**: Includes options for non-linear modifications in algorithms.
+ - **Echo State Networks (ESNs)**: Features various input layers, reservoirs, and methods for driving ESN reservoir states.
+ - **Cellular Automata-Based Reservoir Computing**: Introduces models based on one-dimensional Cellular Automata for Reservoir Computing.
## Contributing
- - Please refer to the
- [SciML ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://github.com/SciML/ColPrac/blob/master/README.md)
- for guidance on PRs, issues, and other matters relating to contributing to SciML.
-
- - See the [SciML Style Guide](https://github.com/SciML/SciMLStyle) for common coding practices and other style decisions.
- - There are a few community forums:
-
- + The #diffeq-bridged and #sciml-bridged channels in the
- [Julia Slack](https://julialang.org/slack/)
- + The #diffeq-bridged and #sciml-bridged channels in the
- [Julia Zulip](https://julialang.zulipchat.com/#narrow/stream/279055-sciml-bridged)
- + On the [Julia Discourse forums](https://discourse.julialang.org)
- + See also [SciML Community page](https://sciml.ai/community/)
-
+Contributions to ReservoirComputing.jl are highly encouraged and appreciated. Whether it's through implementing new RC model variations, enhancing documentation, adding examples, or any improvement, your contribution is valuable. We welcome posts of relevant papers or ideas in the issues section. For deeper insights into the library's functionality, the API section in the documentation is a great resource. For any queries not suited for issues, please reach out to the lead developers via Slack or email.
## Citing
-If you use this library in your work, please cite:
+If you use ReservoirComputing.jl in your work, we kindly ask you to cite it. Here is the BibTeX entry for your convenience:
```bibtex
@article{JMLR:v23:22-0611,
@@ -113,36 +63,46 @@ If you use this library in your work, please cite:
```
## Reproducibility
+
```@raw html
The documentation of this SciML package was built using these direct dependencies,
```
+
```@example
using Pkg # hide
Pkg.status() # hide
```
+
```@raw html
```
+
```@raw html
and using this machine and Julia version.
```
+
```@example
using InteractiveUtils # hide
versioninfo() # hide
```
+
```@raw html
```
+
```@raw html
A more complete overview of all dependencies and their versions is also provided.
```
+
```@example
using Pkg # hide
-Pkg.status(;mode = PKGMODE_MANIFEST) # hide
+Pkg.status(; mode = PKGMODE_MANIFEST) # hide
```
+
```@raw html
```
+
```@eval
using TOML
using Markdown
diff --git a/docs/src/reca_tutorials/reca.md b/docs/src/reca_tutorials/reca.md
index 68410541..65c88044 100644
--- a/docs/src/reca_tutorials/reca.md
+++ b/docs/src/reca_tutorials/reca.md
@@ -5,7 +5,9 @@ Reservoir Computing based on Elementary Cellular Automata (ECA) has been recentl
To showcase how to use these models, this page illustrates the performance of ReCA in the 5 bit memory task [^4]. The script for the example and companion data can be found [here](https://github.com/MartinuzziFrancesco/reservoir-computing-examples/tree/main/reca).
## 5 bit memory task
+
The data can be read as follows:
+
```@example reca
using DelimitedFiles
@@ -14,6 +16,7 @@ output = readdlm("./5bitoutput.txt", ',', Float32)
```
To use a ReCA model, it is necessary to define the rule one intends to use. To do so, ReservoirComputing.jl leverages [CellularAutomata.jl](https://github.com/MartinuzziFrancesco/CellularAutomata.jl) that needs to be called as well to define the `RECA` struct:
+
```@example reca
using ReservoirComputing, CellularAutomata
@@ -21,18 +24,21 @@ ca = DCA(90)
```
To define the ReCA model, it suffices to call:
+
```@example reca
-reca = RECA(input, ca;
+reca = RECA(input, ca;
generations = 16,
input_encoding = RandomMapping(16, 40))
```
-After this, the training can be performed with the chosen method.
+After this, the training can be performed with the chosen method.
+
```@example reca
output_layer = train(reca, output, StandardRidge(0.00001))
```
The prediction in this case will be a `Predictive()` with the input data equal to the training data. In addition, to test the 5 bit memory task, a conversion from Float to Bool is necessary (at the moment, we are aware of a bug that doesn't allow boolean input data to the RECA models):
+
```@example reca
prediction = reca(Predictive(input), output_layer)
final_pred = convert(AbstractArray{Float32}, prediction .> 0.5)
@@ -41,9 +47,6 @@ final_pred == output
```
[^1]: Yilmaz, Ozgur. "Reservoir computing using cellular automata." arXiv preprint arXiv:1410.0162 (2014).
-
[^2]: Margem, Mrwan, and Ozgür Yilmaz. "An experimental study on cellular automata reservoir in pathological sequence learning tasks." (2017).
-
[^3]: Nichele, Stefano, and Andreas Molund. "Deep reservoir computing using cellular automata." arXiv preprint arXiv:1703.02806 (2017).
-
[^4]: Hochreiter, Sepp, and Jürgen Schmidhuber. "Long short-term memory." Neural computation 9.8 (1997): 1735-1780.
diff --git a/src/esn/echostatenetwork.jl b/src/esn/echostatenetwork.jl
index 1230ee5f..42fab481 100644
--- a/src/esn/echostatenetwork.jl
+++ b/src/esn/echostatenetwork.jl
@@ -16,7 +16,9 @@ end
"""
Default()
-Sets the type of the ESN as the standard model. No parameters are needed.
+The `Default` struct specifies the use of the standard model in Echo State Networks (ESNs).
+It requires no parameters and is used when no specific variations or customizations of the ESN model are needed.
+This struct is ideal for straightforward applications where the default ESN settings are sufficient.
"""
struct Default <: AbstractVariation end
struct Hybrid{T, K, O, I, S, D} <: AbstractVariation
@@ -31,11 +33,24 @@ end
"""
Hybrid(prior_model, u0, tspan, datasize)
-Given the model parameters, returns an ```Hybrid``` variation of the ESN. This entails
-a different training and prediction. Construction based on [1].
+Constructs a `Hybrid` variation of Echo State Networks (ESNs) integrating a knowledge-based model
+(`prior_model`) with ESNs for advanced training and prediction in chaotic systems.
-[1] Jaideep Pathak et al. "Hybrid Forecasting of Chaotic Processes: Using Machine
-Learning in Conjunction with a Knowledge-Based Model" (2018)
+# Parameters
+- `prior_model`: A knowledge-based model function for integration with ESNs.
+- `u0`: Initial conditions for the model.
+- `tspan`: Time span as a tuple, indicating the duration for model operation.
+- `datasize`: The size of the data to be processed.
+
+# Returns
+- A `Hybrid` struct instance representing the combined ESN and knowledge-based model.
+
+This method is effective for chaotic processes as highlighted in [^Pathak].
+
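+# Example
+A minimal sketch; the `prior_model` is assumed to be callable as `prior_model(u0, tspan, tsteps)` and to return a matrix with `length(u0)` rows:
+
+```julia
+prior_model(u0, tspan, tsteps) = ones(length(u0), length(tsteps))  # placeholder physics model
+hybrid_variation = Hybrid(prior_model, [1.0, 0.0, 0.0], (0.0, 100.0), 1000)
+```
+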
+Reference:
+[^Pathak]: Jaideep Pathak et al.
+ "Hybrid Forecasting of Chaotic Processes:
+ Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018).
"""
function Hybrid(prior_model, u0, tspan, datasize)
trange = collect(range(tspan[1], tspan[2], length = datasize))
@@ -47,39 +62,44 @@ function Hybrid(prior_model, u0, tspan, datasize)
end
"""
- ESN(train_data;
+ ESN(train_data; kwargs...) -> ESN
+
+Creates an Echo State Network (ESN) using specified parameters and training data, suitable for various machine learning tasks.
+
+# Parameters
+- `train_data`: Matrix of training data (columns as time steps, rows as features).
+- `variation`: Variation of ESN (default: `Default()`).
+- `input_layer`: Input layer of ESN (default: `DenseLayer()`).
+- `reservoir`: Reservoir of the ESN (default: `RandSparseReservoir(100)`).
+- `bias`: Bias vector for each time step (default: `NullLayer()`).
+- `reservoir_driver`: Mechanism for evolving reservoir states (default: `RNN()`).
+- `nla_type`: Non-linear activation type (default: `NLADefault()`).
+- `states_type`: Format for storing states (default: `StandardStates()`).
+- `washout`: Initial time steps to discard (default: `0`).
+- `matrix_type`: Type of matrices used internally (default: type of `train_data`).
+
+# Returns
+- An initialized ESN instance with specified parameters.
+
+# Examples
+```julia
+using ReservoirComputing
+
+train_data = rand(10, 100) # 10 features, 100 time steps
+
+esn = ESN(train_data; reservoir = RandSparseReservoir(200), washout = 10)
+```
+"""
+function ESN(train_data;
variation = Default(),
input_layer = DenseLayer(),
- reservoir = RandSparseReservoir(),
+ reservoir = RandSparseReservoir(100),
bias = NullLayer(),
reservoir_driver = RNN(),
nla_type = NLADefault(),
- states_type = StandardStates())
- (esn::ESN)(prediction::AbstractPrediction,
- output_layer::AbstractOutputLayer;
- initial_conditions=output_layer.last_value,
- last_state=esn.states[:, end])
-
-Constructor for the Echo State Network model. It requires the reservoir size as the input
-and the data for the training. It returns a struct ready to be trained with the states
-already harvested.
-
-After the training, this struct can be used for the prediction following the second
-function call. This will take as input a prediction type and the output layer from the
-training. The ```initial_conditions``` and ```last_state``` parameters can be left as
-they are, unless there is a specific reason to change them. All the components are
-detailed in the API documentation. More examples are given in the general documentation.
-"""
-function ESN(train_data;
- variation = Default(),
- input_layer = DenseLayer(),
- reservoir = RandSparseReservoir(100),
- bias = NullLayer(),
- reservoir_driver = RNN(),
- nla_type = NLADefault(),
- states_type = StandardStates(),
- washout = 0,
- matrix_type = typeof(train_data))
+ states_type = StandardStates(),
+ washout = 0,
+ matrix_type = typeof(train_data))
if variation isa Hybrid
train_data = vcat(train_data, variation.model_data[:, 1:(end - 1)])
end
@@ -87,19 +107,19 @@ function ESN(train_data;
if states_type isa AbstractPaddedStates
in_size = size(train_data, 1) + 1
train_data = vcat(Adapt.adapt(matrix_type, ones(1, size(train_data, 2))),
- train_data)
+ train_data)
else
in_size = size(train_data, 1)
end
input_matrix, reservoir_matrix, bias_vector, res_size = obtain_layers(in_size,
- input_layer,
- reservoir, bias;
- matrix_type = matrix_type)
+ input_layer,
+ reservoir, bias;
+ matrix_type = matrix_type)
inner_res_driver = reservoir_driver_params(reservoir_driver, res_size, in_size)
states = create_states(inner_res_driver, train_data, washout, reservoir_matrix,
- input_matrix, bias_vector)
+ input_matrix, bias_vector)
train_data = train_data[:, (washout + 1):end]
ESN(sum(res_size), train_data, variation, nla_type, input_matrix,
@@ -109,13 +129,13 @@ end
#shallow esn construction
function obtain_layers(in_size,
- input_layer,
- reservoir,
- bias;
- matrix_type = Matrix{Float64})
+ input_layer,
+ reservoir,
+ bias;
+ matrix_type = Matrix{Float64})
input_res_size = get_ressize(reservoir)
input_matrix = create_layer(input_layer, input_res_size, in_size,
- matrix_type = matrix_type)
+ matrix_type = matrix_type)
res_size = size(input_matrix, 1) #WeightedInput actually changes the res size
reservoir_matrix = create_reservoir(reservoir, res_size, matrix_type = matrix_type)
@assert size(reservoir_matrix, 1) == res_size
@@ -127,10 +147,10 @@ end
#there is a bug going on with WeightedLayer in this construction.
#it works for any other though
function obtain_layers(in_size,
- input_layer,
- reservoir::Vector,
- bias;
- matrix_type = Matrix{Float64})
+ input_layer,
+ reservoir::Vector,
+ bias;
+ matrix_type = Matrix{Float64})
esn_depth = length(reservoir)
input_res_sizes = [get_ressize(reservoir[i]) for i in 1:esn_depth]
in_sizes = zeros(Int, esn_depth)
@@ -139,16 +159,16 @@ function obtain_layers(in_size,
if input_layer isa Array
input_matrix = [create_layer(input_layer[j], input_res_sizes[j], in_sizes[j],
- matrix_type = matrix_type) for j in 1:esn_depth]
+ matrix_type = matrix_type) for j in 1:esn_depth]
else
_input_layer = fill(input_layer, esn_depth)
input_matrix = [create_layer(_input_layer[k], input_res_sizes[k], in_sizes[k],
- matrix_type = matrix_type) for k in 1:esn_depth]
+ matrix_type = matrix_type) for k in 1:esn_depth]
end
res_sizes = [get_ressize(input_matrix[j]) for j in 1:esn_depth]
reservoir_matrix = [create_reservoir(reservoir[k], res_sizes[k],
- matrix_type = matrix_type) for k in 1:esn_depth]
+ matrix_type = matrix_type) for k in 1:esn_depth]
if bias isa Array
bias_vector = [create_layer(bias[j], res_sizes[j], 1, matrix_type = matrix_type)
@@ -163,9 +183,9 @@ function obtain_layers(in_size,
end
function (esn::ESN)(prediction::AbstractPrediction,
- output_layer::AbstractOutputLayer;
- last_state = esn.states[:, [end]],
- kwargs...)
+ output_layer::AbstractOutputLayer;
+ last_state = esn.states[:, [end]],
+ kwargs...)
variation = esn.variation
pred_len = prediction.prediction_len
@@ -177,25 +197,56 @@ function (esn::ESN)(prediction::AbstractPrediction,
u0 = variation.model_data[:, end]
model_pred_data = model(u0, tspan_new, predict_tsteps)[:, 2:end]
return obtain_esn_prediction(esn, prediction, last_state, output_layer,
- model_pred_data;
- kwargs...)
+ model_pred_data;
+ kwargs...)
else
return obtain_esn_prediction(esn, prediction, last_state, output_layer;
- kwargs...)
+ kwargs...)
end
end
#training dispatch on esn
"""
- train(esn::AbstractEchoStateNetwork, target_data, training_method=StandardRidge(0.0))
+ train(esn::AbstractEchoStateNetwork, target_data, training_method = StandardRidge(0.0))
+
+Trains an Echo State Network (ESN) using the provided target data and a specified training method.
+
+# Parameters
+- `esn::AbstractEchoStateNetwork`: The ESN instance to be trained.
+- `target_data`: Supervised training data for the ESN.
+- `training_method`: The method for training the ESN (default: `StandardRidge(0.0)`).
+
+# Returns
+- An `OutputLayer` object containing the trained output matrix and the information
+  needed by the `esn` call to perform the prediction.
+
+# Example
+```julia
+using ReservoirComputing
+
+# Initialize an ESN instance and target data
+train_data = rand(10, 100)  # 10 features, 100 time steps
+esn = ESN(train_data; reservoir = RandSparseReservoir(200))
+target_data = rand(10, size(train_data, 2))
+
+# Train the ESN using the default training method (Ordinary Least Squares)
+output_layer = train(esn, target_data)
+
+# Train the ESN with an explicit ridge regularization coefficient
+output_layer = train(esn, target_data, StandardRidge(1.0))
+```
-Training of the built ESN over the ```target_data```. The default training method is
-RidgeRegression. The output is an ```OutputLayer``` object to be fed to the esn call
-for the prediction.
+# Notes
+- When using a `Hybrid` variation, the function extends the state matrix with data from the
+ physical model included in the `variation`.
+- The training is handled by a lower-level `_train` function which takes the new state matrix
+ and performs the actual training using the specified `training_method`.
"""
function train(esn::AbstractEchoStateNetwork,
- target_data,
- training_method = StandardRidge(0.0))
+ target_data,
+ training_method = StandardRidge(0.0))
variation = esn.variation
if esn.variation isa Hybrid
diff --git a/src/esn/esn_input_layers.jl b/src/esn/esn_input_layers.jl
index 4794be43..e7bb950c 100644
--- a/src/esn/esn_input_layers.jl
+++ b/src/esn/esn_input_layers.jl
@@ -8,22 +8,30 @@ end
    WeightedLayer(scaling)
    WeightedLayer(;scaling=0.1)
-Returns a weighted layer initializer object, that will produce a weighted input matrix with
-random non-zero elements drawn from [-```scaling```, ```scaling```], as described
-in [1]. The ```scaling``` factor can be given as arg or kwarg.
+Creates a `WeightedLayer` initializer for Echo State Networks.
+This initializer generates a weighted input matrix with random non-zero
+elements distributed uniformly within the range [-`scaling`, `scaling`],
+following the approach in [^Lu].
-[1] Lu, Zhixin, et al. "_Reservoir observers: Model-free inference of unmeasured variables
-in chaotic systems._"
-Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102.
+# Parameters
+- `scaling`: The scaling factor for the weight distribution (default: 0.1).
+
+# Returns
+- A `WeightedLayer` instance to be used for initializing the input layer of an ESN.
+
+Reference:
+[^Lu]: Lu, Zhixin, et al.
+ "Reservoir observers: Model-free inference of unmeasured variables in chaotic systems."
+ Chaos: An Interdisciplinary Journal of Nonlinear Science 27.4 (2017): 041102.
"""
function WeightedLayer(; scaling = 0.1)
return WeightedLayer(scaling)
end
function create_layer(input_layer::WeightedLayer,
- approx_res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ approx_res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
scaling = input_layer.scaling
res_size = Int(floor(approx_res_size / in_size) * in_size)
layer_matrix = zeros(res_size, in_size)
@@ -31,7 +39,7 @@ function create_layer(input_layer::WeightedLayer,
for i in 1:in_size
layer_matrix[((i - 1) * q + 1):((i) * q), i] = rand(Uniform(-scaling, scaling), 1,
- q)
+ q)
end
return Adapt.adapt(matrix_type, layer_matrix)
@@ -45,10 +53,16 @@ end
DenseLayer(scaling)
DenseLayer(;scaling=0.1)
-Returns a fully connected layer initializer object, that will produce a weighted input
-matrix with random non-zero elements drawn from [-```scaling```, ```scaling```].
-The ```scaling``` factor can be given as arg or kwarg. This is the default choice in the
-```ESN``` construction.
+Creates a `DenseLayer` initializer for Echo State Networks, generating a fully connected input layer.
+The layer is initialized with random weights uniformly distributed within [-`scaling`, `scaling`].
+This scaling factor can be provided either as an argument or a keyword argument.
+The `DenseLayer` is the default input layer in `ESN` construction.
+
+# Parameters
+- `scaling`: The scaling factor for weight distribution (default: 0.1).
+
+# Returns
+- A `DenseLayer` instance for initializing the ESN's input layer.
"""
struct DenseLayer{T} <: AbstractLayer
scaling::T
@@ -61,13 +75,20 @@ end
"""
create_layer(input_layer::AbstractLayer, res_size, in_size)
-Returns a ```res_size``` times ```in_size``` matrix layer, built according to the
-```input_layer``` constructor.
+Generates a matrix layer of size `res_size` x `in_size`, constructed according to the specifications of the `input_layer`.
+
+# Parameters
+- `input_layer`: An instance of `AbstractLayer` determining the layer construction.
+- `res_size`: The number of rows (reservoir size) for the layer.
+- `in_size`: The number of columns (input size) for the layer.
+
+# Returns
+- A matrix representing the constructed layer.
"""
function create_layer(input_layer::DenseLayer,
- res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
scaling = input_layer.scaling
layer_matrix = rand(Uniform(-scaling, scaling), res_size, in_size)
return Adapt.adapt(matrix_type, layer_matrix)
@@ -78,9 +99,16 @@ end
SparseLayer(scaling; sparsity=0.1)
SparseLayer(;scaling=0.1, sparsity=0.1)
-Returns a sparsely connected layer initializer object, that will produce a random sparse
-input matrix with random non-zero elements drawn from [-```scaling```, ```scaling```] and
-given sparsity. The ```scaling``` and ```sparsity``` factors can be given as args or kwargs.
+Creates a `SparseLayer` initializer for Echo State Networks, generating a sparse input layer.
+The layer is initialized with weights distributed within [-`scaling`, `scaling`]
+and a specified `sparsity` level. Both `scaling` and `sparsity` can be set as arguments or keyword arguments.
+
+# Parameters
+- `scaling`: Scaling factor for weight distribution (default: 0.1).
+- `sparsity`: Sparsity level of the layer (default: 0.1).
+
+# Returns
+- A `SparseLayer` instance for initializing ESN's input layer with sparse connections.
"""
struct SparseLayer{T} <: AbstractLayer
scaling::T
@@ -96,9 +124,9 @@ function SparseLayer(scaling_arg; scaling = scaling_arg, sparsity = 0.1)
end
function create_layer(input_layer::SparseLayer,
- res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
layer_matrix = Matrix(sprand(res_size, in_size, input_layer.sparsity))
layer_matrix = 2.0 .* (layer_matrix .- 0.5)
replace!(layer_matrix, -1.0 => 0.0)
@@ -117,13 +145,21 @@ end
BernoulliSample(p)
BernoulliSample(;p=0.5)
-Returns a Bernoulli sign constructor for the ```MinimumLayer``` call. The ```p``` factor
-determines the probability of the result, as in the Distributions call. The value can be
-passed as an arg or kwarg. This sign weight determination for input layers is introduced
-in [1].
+Creates a `BernoulliSample` constructor for the `MinimumLayer`.
+It uses a Bernoulli distribution to determine the sign of weights in the input layer.
+The parameter `p` sets the probability of a weight being positive, as per the `Distributions` package.
+This method of sign weight determination for input layers is based on the approach in [^Rodan].
+
+# Parameters
+- `p`: Probability of a positive weight (default: 0.5).
-[1] Rodan, Ali, and Peter Tino. "_Minimum complexity echo state network._"
-IEEE transactions on neural networks 22.1 (2010): 131-144.
+# Returns
+- A `BernoulliSample` instance for generating sign weights in `MinimumLayer`.
+
+Reference:
+[^Rodan]: Rodan, Ali, and Peter Tino.
+ "Minimum complexity echo state network."
+ IEEE Transactions on Neural Networks 22.1 (2010): 131-144.
"""
function BernoulliSample(; p = 0.5)
return BernoulliSample(p)
@@ -138,13 +174,22 @@ end
IrrationalSample(irrational, start)
IrrationalSample(;irrational=pi, start=1)
-Returns an irrational sign constructor for the ```MinimumLayer``` call. The values can be
-passed as args or kwargs. The sign of the weight is decided from the decimal expansion of
-the given ```irrational```. The first ```start``` decimal digits are thresholded at 4.5,
-then the n-th input sign will be + and - respectively.
+Creates an `IrrationalSample` constructor for the `MinimumLayer`.
+It determines the sign of weights in the input layer based on the decimal expansion of an `irrational` number.
+The `start` parameter sets the starting point in the decimal sequence.
+The signs are assigned based on the thresholding of each decimal digit against 4.5, as described in [^Rodan].
+
+# Parameters
+- `irrational`: An irrational number for weight sign determination (default: π).
+- `start`: Starting index in the decimal expansion (default: 1).
-[1] Rodan, Ali, and Peter Tiňo. "_Simple deterministically constructed cycle reservoirs
-with regular jumps._" Neural computation 24.7 (2012): 1822-1852.
+# Returns
+- An `IrrationalSample` instance for generating sign weights in `MinimumLayer`.
+
+Reference:
+[^Rodan]: Rodan, Ali, and Peter Tiňo.
+ "Simple deterministically constructed cycle reservoirs with regular jumps."
+ Neural Computation 24.7 (2012): 1822-1852.
"""
function IrrationalSample(; irrational = pi, start = 1)
return IrrationalSample(irrational, start)
@@ -160,15 +205,25 @@ end
MinimumLayer(weight; sampling=BernoulliSample(0.5))
MinimumLayer(;weight=0.1, sampling=BernoulliSample(0.5))
-Returns a fully connected layer initializer object. The matrix constructed with this
-initializer presents the same absolute weight value, decided by the ```weight``` factor.
-The sign of each entry is decided by the ```sampling``` struct. Construction detailed
-in [1] and [2].
-
-[1] Rodan, Ali, and Peter Tino. "_Minimum complexity echo state network._"
-IEEE transactions on neural networks 22.1 (2010): 131-144.
-[2] Rodan, Ali, and Peter Tiňo. "_Simple deterministically constructed cycle reservoirs
-with regular jumps._" Neural computation 24.7 (2012): 1822-1852.
+Creates a `MinimumLayer` initializer for Echo State Networks, generating a fully connected input layer.
+This layer has a uniform absolute weight value (`weight`) with the sign of each
+weight determined by the `sampling` method. This approach, as detailed in [^Rodan1] and [^Rodan2],
+allows for controlled weight distribution in the layer.
+
+# Parameters
+- `weight`: Absolute value of weights in the layer.
+- `sampling`: Method for determining the sign of weights (default: `BernoulliSample(0.5)`).
+
+# Returns
+- A `MinimumLayer` instance for initializing the ESN's input layer.
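+
+# Example
+A minimal usage sketch (the reservoir and input sizes are illustrative):
+
+```julia
+using ReservoirComputing
+
+input_layer = MinimumLayer(0.1; sampling = BernoulliSample(0.5))
+W_in = create_layer(input_layer, 100, 3)  # every entry is either +0.1 or -0.1
+```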
+
+References:
+[^Rodan1]: Rodan, Ali, and Peter Tino.
+ "Minimum complexity echo state network."
+ IEEE Transactions on Neural Networks 22.1 (2010): 131-144.
+[^Rodan2]: Rodan, Ali, and Peter Tiňo.
+ "Simple deterministically constructed cycle reservoirs with regular jumps."
+ Neural Computation 24.7 (2012): 1822-1852.
"""
function MinimumLayer(weight; sampling = BernoulliSample(0.5))
return MinimumLayer(weight, sampling)
@@ -179,9 +234,9 @@ function MinimumLayer(; weight = 0.1, sampling = BernoulliSample(0.5))
end
function create_layer(input_layer::MinimumLayer,
- res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
sampling = input_layer.sampling
weight = input_layer.weight
layer_matrix = create_minimum_input(sampling, res_size, in_size, weight)
@@ -234,22 +289,35 @@ end
"""
InformedLayer(model_in_size; scaling=0.1, gamma=0.5)
-Returns a weighted input layer matrix, with random non-zero elements drawn from
-[-```scaling```, ```scaling```], where some γ of reservoir nodes are connected exclusively
-to the raw inputs, and the rest to the outputs of the prior knowledge model,
-as described in [1].
+Creates an `InformedLayer` initializer for Echo State Networks (ESNs) that generates
+a weighted input layer matrix. The matrix contains random non-zero elements drawn from
+the range [-`scaling`, `scaling`]. This initializer ensures that a fraction (`gamma`)
+of reservoir nodes are exclusively connected to the raw inputs, while the rest are
+connected to the outputs of a prior knowledge model, as described in [^Pathak].
-[1] Jaideep Pathak et al. "_Hybrid Forecasting of Chaotic Processes: Using Machine Learning
-in Conjunction with a Knowledge-Based Model_" (2018)
+# Arguments
+- `model_in_size`: The size of the prior knowledge model's output,
+ which determines the number of columns in the input layer matrix.
+
+# Keyword Arguments
+- `scaling`: The absolute value of the weights (default: 0.1).
+- `gamma`: The fraction of reservoir nodes connected exclusively to raw inputs (default: 0.5).
+
+# Returns
+- An `InformedLayer` instance for initializing the ESN's input layer matrix.
+
+Reference:
+[^Pathak]: Jaideep Pathak et al.
+ "Hybrid Forecasting of Chaotic Processes: Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018).
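+
+# Example
+A minimal usage sketch, assuming 3 raw input features and a prior model with a 3-dimensional output
+(so the total input size is 6; sizes are illustrative):
+
+```julia
+using ReservoirComputing
+
+input_layer = InformedLayer(3; scaling = 0.1, gamma = 0.5)
+W_in = create_layer(input_layer, 100, 6)  # 100×6 matrix
+```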
"""
function InformedLayer(model_in_size; scaling = 0.1, gamma = 0.5)
return InformedLayer(scaling, gamma, model_in_size)
end
function create_layer(input_layer::InformedLayer,
- res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
scaling = input_layer.scaling
state_size = in_size - input_layer.model_in_size
@@ -286,15 +354,18 @@ function create_layer(input_layer::InformedLayer,
end
"""
- NullLayer(model_in_size; scaling=0.1, gamma=0.5)
+ NullLayer()
+
+Creates a `NullLayer` initializer for Echo State Networks (ESNs) that generates a zero input layer matrix of size `res_size x in_size`.
-Returns a vector of zeros.
+# Returns
+- A `NullLayer` instance for initializing the ESN's input layer matrix.
"""
struct NullLayer <: AbstractLayer end
function create_layer(input_layer::NullLayer,
- res_size,
- in_size;
- matrix_type = Matrix{Float64})
+ res_size,
+ in_size;
+ matrix_type = Matrix{Float64})
return Adapt.adapt(matrix_type, zeros(res_size, in_size))
end
diff --git a/src/esn/esn_predict.jl b/src/esn/esn_predict.jl
index 73cddd38..daa6fc34 100644
--- a/src/esn/esn_predict.jl
+++ b/src/esn/esn_predict.jl
@@ -1,10 +1,10 @@
function obtain_esn_prediction(esn,
- prediction::Generative,
- x,
- output_layer,
- args...;
- initial_conditions = output_layer.last_value,
- save_states = false)
+ prediction::Generative,
+ x,
+ output_layer,
+ args...;
+ initial_conditions = output_layer.last_value,
+ save_states = false)
out_size = output_layer.out_size
training_method = output_layer.training_method
prediction_len = prediction.prediction_len
@@ -19,7 +19,7 @@ function obtain_esn_prediction(esn,
for i in 1:prediction_len
x, x_new = next_state_prediction!(esn, x, x_new, out, out_pad, i, tmp_array,
- args...)
+ args...)
out_tmp = get_prediction(output_layer.training_method, output_layer, x_new)
out = store_results!(output_layer.training_method, out_tmp, output, i)
states[:, i] = x
@@ -29,12 +29,12 @@ function obtain_esn_prediction(esn,
end
function obtain_esn_prediction(esn,
- prediction::Predictive,
- x,
- output_layer,
- args...;
- initial_conditions = output_layer.last_value,
- save_states = false)
+ prediction::Predictive,
+ x,
+ output_layer,
+ args...;
+ initial_conditions = output_layer.last_value,
+ save_states = false)
out_size = output_layer.out_size
training_method = output_layer.training_method
prediction_len = prediction.prediction_len
@@ -49,7 +49,7 @@ function obtain_esn_prediction(esn,
for i in 1:prediction_len
x, x_new = next_state_prediction!(esn, x, x_new, prediction.prediction_data[:, i],
- out_pad, i, tmp_array, args...)
+ out_pad, i, tmp_array, args...)
out_tmp = get_prediction(training_method, output_layer, x_new)
out = store_results!(training_method, out_tmp, output, i)
states[:, i] = x
@@ -61,40 +61,40 @@ end
#prediction dispatch on esn
function next_state_prediction!(esn::ESN, x, x_new, out, out_pad, i, tmp_array, args...)
return _variation_prediction!(esn.variation, esn, x, x_new, out, out_pad, i, tmp_array,
- args...)
+ args...)
end
#dispatch the prediction on the esn variation
function _variation_prediction!(variation,
- esn,
- x,
- x_new,
- out,
- out_pad,
- i,
- tmp_array,
- args...)
+ esn,
+ x,
+ x_new,
+ out,
+ out_pad,
+ i,
+ tmp_array,
+ args...)
out_pad = pad_state!(esn.states_type, out_pad, out)
xv = @view x[1:(esn.res_size)]
x = next_state!(x, esn.reservoir_driver, xv, out_pad,
- esn.reservoir_matrix, esn.input_matrix, esn.bias_vector, tmp_array)
+ esn.reservoir_matrix, esn.input_matrix, esn.bias_vector, tmp_array)
x_new = esn.states_type(esn.nla_type, x, out_pad)
return x, x_new
end
function _variation_prediction!(variation::Hybrid,
- esn,
- x,
- x_new,
- out,
- out_pad,
- i,
- tmp_array,
- model_prediction_data)
+ esn,
+ x,
+ x_new,
+ out,
+ out_pad,
+ i,
+ tmp_array,
+ model_prediction_data)
out_tmp = vcat(out, model_prediction_data[:, i])
out_pad = pad_state!(esn.states_type, out_pad, out_tmp)
x = next_state!(x, esn.reservoir_driver, x[1:(esn.res_size)], out_pad,
- esn.reservoir_matrix, esn.input_matrix, esn.bias_vector, tmp_array)
+ esn.reservoir_matrix, esn.input_matrix, esn.bias_vector, tmp_array)
x_tmp = vcat(x, model_prediction_data[:, i])
x_new = esn.states_type(esn.nla_type, x_tmp, out_pad)
return x, x_new
diff --git a/src/esn/esn_reservoir_drivers.jl b/src/esn/esn_reservoir_drivers.jl
index 8590b279..6ab198d7 100644
--- a/src/esn/esn_reservoir_drivers.jl
+++ b/src/esn/esn_reservoir_drivers.jl
@@ -2,17 +2,35 @@ abstract type AbstractReservoirDriver end
"""
create_states(
- reservoir_driver::AbstractReservoirDriver, train_data, reservoir_matrix,input_matrix
+ reservoir_driver::AbstractReservoirDriver,
+ train_data,
+ washout,
+ reservoir_matrix,
+ input_matrix,
+ bias_vector
)
-Return the trained ESN states according to the given driver.
+Create and return the trained Echo State Network (ESN) states according to the specified reservoir driver.
+
+# Arguments
+- `reservoir_driver::AbstractReservoirDriver`: The reservoir driver that determines how the ESN states evolve over time.
+- `train_data`: The training data used to train the ESN.
+- `washout::Int`: The number of initial time steps to discard during training to allow the reservoir dynamics to wash out the initial conditions.
+- `reservoir_matrix`: The reservoir matrix representing the dynamic, recurrent part of the ESN.
+- `input_matrix`: The input matrix that defines the connections between input features and reservoir nodes.
+- `bias_vector`: The bias vector to be added at each time step during the reservoir update.
+
+# Returns
+- A matrix of trained ESN states, where each column represents the state at a specific time step.
+
"""
function create_states(reservoir_driver::AbstractReservoirDriver,
- train_data,
- washout,
- reservoir_matrix,
- input_matrix,
- bias_vector)
+ train_data,
+ washout,
+ reservoir_matrix,
+ input_matrix,
+ bias_vector)
train_len = size(train_data, 2) - washout
res_size = size(reservoir_matrix, 1)
@@ -23,13 +41,13 @@ function create_states(reservoir_driver::AbstractReservoirDriver,
for i in 1:washout
yv = @view train_data[:, i]
_state = next_state!(_state, reservoir_driver, _state, yv, reservoir_matrix,
- input_matrix, bias_vector, tmp_array)
+ input_matrix, bias_vector, tmp_array)
end
for j in 1:train_len
yv = @view train_data[:, washout + j]
_state = next_state!(_state, reservoir_driver, _state, yv,
- reservoir_matrix, input_matrix, bias_vector, tmp_array)
+ reservoir_matrix, input_matrix, bias_vector, tmp_array)
states[:, j] = _state
end
@@ -37,11 +55,11 @@ function create_states(reservoir_driver::AbstractReservoirDriver,
end
function create_states(reservoir_driver::AbstractReservoirDriver,
- train_data,
- washout,
- reservoir_matrix::Vector,
- input_matrix,
- bias_vector)
+ train_data,
+ washout,
+ reservoir_matrix::Vector,
+ input_matrix,
+ bias_vector)
train_len = size(train_data, 2) - washout
res_size = sum([size(reservoir_matrix[i], 1) for i in 1:length(reservoir_matrix)])
@@ -52,17 +70,17 @@ function create_states(reservoir_driver::AbstractReservoirDriver,
for i in 1:washout
for j in 1:length(reservoir_matrix)
_inter_state = next_state!(_inter_state, reservoir_driver, _inter_state,
- train_data[:, i],
- reservoir_matrix, input_matrix, bias_vector,
- tmp_array)
+ train_data[:, i],
+ reservoir_matrix, input_matrix, bias_vector,
+ tmp_array)
end
_state = next_state!(_state, reservoir_driver, _state, train_data[:, i],
- reservoir_matrix, input_matrix, bias_vector, tmp_array)
+ reservoir_matrix, input_matrix, bias_vector, tmp_array)
end
for j in 1:train_len
_state = next_state!(_state, reservoir_driver, _state, train_data[:, washout + j],
- reservoir_matrix, input_matrix, bias_vector, tmp_array)
+ reservoir_matrix, input_matrix, bias_vector, tmp_array)
states[:, j] = _state
end
@@ -79,7 +97,19 @@ end
RNN(activation_function, leaky_coefficient)
RNN(;activation_function=tanh, leaky_coefficient=1.0)
-Returns a Recurrent Neural Network initializer for the ESN. This is the default choice.
+Returns a Recurrent Neural Network (RNN) initializer for the Echo State Network (ESN).
+This is the default reservoir driver.
+
+# Arguments
+- `activation_function`: The activation function applied at each reservoir update. Defaults to `tanh`.
+- `leaky_coefficient`: The leaky integration coefficient. Defaults to 1.0.
+
+Both parameters can be passed either as positional or as keyword arguments.
+The resulting RNN object can be used as a reservoir driver in the `ESN` constructor.
+
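+# Example
+A minimal sketch with toy data, showing the driver being passed to the `ESN` constructor (sizes and values are illustrative):
+
+```julia
+using ReservoirComputing
+
+input_data = rand(3, 500)  # toy data: 3 features, 500 time steps
+driver = RNN(; activation_function = tanh, leaky_coefficient = 0.9)
+esn = ESN(input_data; reservoir = RandSparseReservoir(100), reservoir_driver = driver)
+```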
"""
function RNN(; activation_function = NNlib.fast_act(tanh), leaky_coefficient = 1.0)
RNN(activation_function, leaky_coefficient)
@@ -127,24 +157,33 @@ end
"""
MRNN(activation_function, leaky_coefficient, scaling_factor)
- MRNN(;activation_function=[tanh, sigmoid], leaky_coefficient=1.0,
+ MRNN(;activation_function=[tanh, sigmoid], leaky_coefficient=1.0,
scaling_factor=fill(leaky_coefficient, length(activation_function)))
-Returns a Multiple RNN initializer, where multiple functions are combined in a linear
-combination with chosen parameters ```scaling_factor```. The ```activation_function```
-and ```scaling_factor``` arguments must be vectors of the same size. Multiple combinations
-are possible. The implementation is based upon the double activation function idea,
-found in [1].
+Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN), introduced in [^Lun].
+The driver combines multiple activation functions in a linear combination weighted by `scaling_factor`.
+
+# Arguments
+- `activation_function`: A vector of activation functions to combine. Defaults to `[tanh, sigmoid]`.
+- `leaky_coefficient`: The leaky integration coefficient. Defaults to 1.0.
+- `scaling_factor`: A vector of scaling factors, one per activation function, used in the linear combination.
+  Defaults to a vector of the same length as `activation_function` with every element set to `leaky_coefficient`.
+
+All parameters can be passed either as positional or as keyword arguments.
+`activation_function` and `scaling_factor` must be vectors of the same length.
+
-[1] Lun, Shu-Xian, et al. "_A novel model of leaky integrator echo state network for
-time-series prediction._" Neurocomputing 159 (2015): 58-66.
+The resulting MRNN object can be used as a reservoir driver in the ESN.
+
+# Reference
+[^Lun]: Lun, Shu-Xian, et al.
+    "A novel model of leaky integrator echo state network for time-series prediction."
+    Neurocomputing 159 (2015): 58-66.
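+
+# Example
+A minimal sketch with toy data and a locally defined sigmoid (mirroring the combination used in the package tests; sizes are illustrative):
+
+```julia
+using ReservoirComputing
+
+input_data = rand(3, 500)       # toy data
+sigmoid(x) = 1 / (1 + exp(-x))  # local sigmoid, to keep the example self-contained
+driver = MRNN(; activation_function = [tanh, sigmoid], scaling_factor = [0.8, 0.1])
+esn = ESN(input_data; reservoir = RandSparseReservoir(100), reservoir_driver = driver)
+```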
"""
function MRNN(
- ;
- activation_function = [tanh, sigmoid],
- leaky_coefficient = 1.0,
- scaling_factor = fill(leaky_coefficient, length(activation_function)))
+ ;
+ activation_function = [tanh, sigmoid],
+ leaky_coefficient = 1.0,
+ scaling_factor = fill(leaky_coefficient, length(activation_function)))
@assert length(activation_function) == length(scaling_factor)
return MRNN(activation_function, leaky_coefficient, scaling_factor)
end
@@ -192,20 +231,28 @@ end
"""
FullyGated()
-Returns a standard Gated Recurrent Unit ESN initializer, as described in [1].
+Returns a Fully Gated Recurrent Unit (FullyGated) initializer for the Echo State Network (ESN).
+
+This function creates a FullyGated object, which can be used as a reservoir driver in the ESN.
+The FullyGated variant is described in the literature reference [^cho].
+
+# Returns
+- `FullyGated`: A FullyGated reservoir driver.
+
+# Reference
+[^cho]: Cho, Kyunghyun, et al.
+ "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._"
+ arXiv preprint arXiv:1406.1078 (2014).
-[1] Cho, Kyunghyun, et al. “_Learning phrase representations using RNN encoder-decoder
-for statistical machine translation._”
-arXiv preprint arXiv:1406.1078 (2014).
"""
struct FullyGated <: AbstractGRUVariant end
"""
Minimal()
-Returns a minimal GRU ESN initializer as described in [1].
+Returns a minimal GRU ESN initializer as described in [^Zhou].
-[1] Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._"
+[^Zhou]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._"
International Journal of Automation and Computing 13.3 (2016): 226-234.
"""
struct Minimal <: AbstractGRUVariant end
@@ -218,18 +265,30 @@ struct Minimal <: AbstractGRUVariant end
bias = fill(DenseLayer(), 2),
variant = FullyGated())
-Returns a Gated Recurrent Unit [1] reservoir driver.
+Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Networks (ESNs). This driver is based on the GRU architecture [^Cho], which is designed to capture temporal dependencies in data and is commonly used in various machine learning applications.
+
+# Arguments
+- `activation_function`: An array of activation functions for the GRU gates. By default, sigmoid is used for the update and reset gates and `tanh` for the candidate hidden state.
+- `inner_layer`: An array of inner layers used in the GRU architecture. By default, it uses two dense layers.
+- `reservoir`: An array of reservoir layers. By default, it uses two random sparse reservoirs.
+- `bias`: An array of bias layers for the GRU. By default, it uses two dense layers.
+- `variant`: The GRU variant to use. By default, it uses the "FullyGated" variant.
-[1] Cho, Kyunghyun, et al. “_Learning phrase representations using RNN encoder-decoder for
-statistical machine translation._” arXiv preprint arXiv:1406.1078 (2014).
+# Returns
+A `GRU` object that holds the specified parameters and can be used as a reservoir driver in the ESN.
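+
+# Example
+A minimal sketch mirroring the package tests (reservoir sizes and parameters are illustrative):
+
+```julia
+using ReservoirComputing
+
+input_data = rand(3, 500)  # toy data
+esn = ESN(input_data;
+    reservoir = RandSparseReservoir(100, 1.2, 0.1),
+    reservoir_driver = GRU(variant = FullyGated(),
+        reservoir = [RandSparseReservoir(100, 1.0, 0.5),
+            RandSparseReservoir(100, 1.2, 0.1)]))
+```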
+
+# References
+[^Cho]: Cho, Kyunghyun, et al.
+ "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._"
+ arXiv preprint arXiv:1406.1078 (2014).
"""
function GRU(
- ;
- activation_function = [NNlib.sigmoid, NNlib.sigmoid, tanh],
- inner_layer = fill(DenseLayer(), 2),
- reservoir = fill(RandSparseReservoir(0), 2),
- bias = fill(DenseLayer(), 2),
- variant = FullyGated())
+ ;
+ activation_function = [NNlib.sigmoid, NNlib.sigmoid, tanh],
+ inner_layer = fill(DenseLayer(), 2),
+ reservoir = fill(RandSparseReservoir(0), 2),
+ bias = fill(DenseLayer(), 2),
+ variant = FullyGated())
return GRU(activation_function, inner_layer, reservoir, bias, variant)
end
diff --git a/src/esn/esn_reservoirs.jl b/src/esn/esn_reservoirs.jl
index 434a8cda..e014e4e7 100644
--- a/src/esn/esn_reservoirs.jl
+++ b/src/esn/esn_reservoirs.jl
@@ -18,9 +18,19 @@ end
RandSparseReservoir(res_size, radius, sparsity)
RandSparseReservoir(res_size; radius=1.0, sparsity=0.1)
-Returns a random sparse reservoir initializer, that will return a matrix with given
-`sparsity` and scaled spectral radius according to `radius`. This is the default choice
-in the ```ESN``` construction.
+
+Returns a random sparse reservoir initializer, which generates a matrix of size `res_size x res_size` with the specified `sparsity` and scaled spectral radius according to `radius`. This type of reservoir initializer is commonly used in Echo State Networks (ESNs) for capturing complex temporal dependencies.
+
+# Arguments
+- `res_size`: The size of the reservoir matrix.
+- `radius`: The desired spectral radius of the reservoir. By default, it is set to 1.0.
+- `sparsity`: The sparsity level of the reservoir matrix, controlling the fraction of zero elements. By default, it is set to 0.1.
+
+# Returns
+A `RandSparseReservoir` object that can be used as a reservoir initializer in ESN construction.
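+
+# Example
+A minimal usage sketch (the matrix is random, so values are illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = RandSparseReservoir(100; radius = 1.2, sparsity = 0.1)
+W = create_reservoir(reservoir, 100)  # 100×100 matrix rescaled to spectral radius ≈ 1.2
+```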
"""
function RandSparseReservoir(res_size; radius = 1.0, sparsity = 0.1)
return RandSparseReservoir(res_size, radius, sparsity)
@@ -30,12 +40,22 @@ end
create_reservoir(reservoir::AbstractReservoir, res_size)
create_reservoir(reservoir, args...)
-Given an ```AbstractReservoir` constructor and the reservoir size, it returns the
-corresponding matrix. Alternatively, it accepts a given matrix.
+Given an `AbstractReservoir` constructor and the size of the reservoir (`res_size`), this function returns the corresponding reservoir matrix. Alternatively, it accepts a pre-generated matrix.
+
+# Arguments
+- `reservoir`: An `AbstractReservoir` object or constructor.
+- `res_size`: The size of the reservoir matrix.
+- `matrix_type`: The type of the resulting matrix. By default, it is set to `Matrix{Float64}`.
+
+# Returns
+A matrix representing the reservoir, generated based on the properties of the specified `reservoir` object or constructor.
+
"""
function create_reservoir(reservoir::RandSparseReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
reservoir_matrix = Matrix(sprand(res_size, res_size, reservoir.sparsity))
reservoir_matrix = 2.0 .* (reservoir_matrix .- 0.5)
replace!(reservoir_matrix, -1.0 => 0.0)
@@ -73,10 +93,10 @@ struct PseudoSVDReservoir{T, C} <: AbstractReservoir
end
function PseudoSVDReservoir(res_size;
- max_value = 1.0,
- sparsity = 0.1,
- sorted = true,
- reverse_sort = false)
+ max_value = 1.0,
+ sparsity = 0.1,
+ sorted = true,
+ reverse_sort = false)
return PseudoSVDReservoir(res_size, max_value, sparsity, sorted, reverse_sort)
end
@@ -84,29 +104,40 @@ end
PseudoSVDReservoir(max_value, sparsity, sorted, reverse_sort)
PseudoSVDReservoir(max_value, sparsity; sorted=true, reverse_sort=false)
-Returns an initializer to build a sparse reservoir matrix, with given ```sparsity```
-created using SVD as described in [1].
+Returns an initializer to build a sparse reservoir matrix with the given `sparsity` by using a pseudo-SVD approach as described in [^yang].
+
+# Arguments
+- `res_size`: The size of the reservoir matrix.
+- `max_value`: The maximum absolute value of elements in the matrix.
+- `sparsity`: The desired sparsity level of the reservoir matrix.
+- `sorted`: A boolean indicating whether to sort the singular values before creating the diagonal matrix. By default, it is set to `true`.
+- `reverse_sort`: A boolean indicating whether to reverse the sorted singular values. By default, it is set to `false`.
+
+# Returns
+A PseudoSVDReservoir object that can be used as a reservoir initializer in ESN construction.
-[1] Yang, Cuili, et al. "_Design of polynomial echo state networks for time
-series prediction._" Neurocomputing 290 (2018): 148-160.
+# References
+This reservoir initialization method, based on a pseudo-SVD approach, is inspired by the work in [^yang], which focuses on designing polynomial echo state networks for time series prediction.
+
+[^yang]: Yang, Cuili, et al. "_Design of polynomial echo state networks for time series prediction._" Neurocomputing 290 (2018): 148-160.
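+
+# Example
+A minimal usage sketch (sizes and parameters are illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = PseudoSVDReservoir(100, max_value = 1.0, sparsity = 0.1)
+W = create_reservoir(reservoir, 100)  # 100×100 matrix with entries bounded by max_value
+```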
"""
function PseudoSVDReservoir(res_size, max_value, sparsity; sorted = true,
- reverse_sort = false)
+ reverse_sort = false)
return PseudoSVDReservoir(res_size, max_value, sparsity, sorted, reverse_sort)
end
function create_reservoir(reservoir::PseudoSVDReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
sorted = reservoir.sorted
reverse_sort = reservoir.reverse_sort
reservoir_matrix = create_diag(res_size, reservoir.max_value, sorted = sorted,
- reverse_sort = reverse_sort)
+ reverse_sort = reverse_sort)
tmp_sparsity = get_sparsity(reservoir_matrix, res_size)
while tmp_sparsity <= reservoir.sparsity
reservoir_matrix *= create_qmatrix(res_size, rand(1:res_size), rand(1:res_size),
- rand() * 2 - 1)
+ rand() * 2 - 1)
tmp_sparsity = get_sparsity(reservoir_matrix, res_size)
end
@@ -165,10 +196,17 @@ end
DelayLineReservoir(res_size; weight=0.1)
Returns a Delay Line Reservoir matrix constructor to obtain a deterministic reservoir as
-described in [1]. The ```weight``` can be passed as arg or kwarg, and it determines the
-absolute value of all the connections in the reservoir.
+described in [^Rodan2010].
-[1] Rodan, Ali, and Peter Tino. "_Minimum complexity echo state network._"
+# Arguments
+- `res_size::Int`: The size of the reservoir.
+- `weight::T`: Determines the absolute value of all connections in the reservoir. It can be passed as an argument or a keyword argument, and defaults to 0.1.
+
+# Returns
+A `DelayLineReservoir` object.
+
+# References
+[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network."
IEEE transactions on neural networks 22.1 (2010): 131-144.
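+
+# Example
+A minimal usage sketch (the reservoir size is illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = DelayLineReservoir(10; weight = 0.1)
+W = create_reservoir(reservoir, 10)  # 10×10 matrix whose non-zero connections all equal 0.1
+```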
"""
function DelayLineReservoir(res_size; weight = 0.1)
@@ -176,8 +214,8 @@ function DelayLineReservoir(res_size; weight = 0.1)
end
function create_reservoir(reservoir::DelayLineReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
reservoir_matrix = zeros(res_size, res_size)
for i in 1:(res_size - 1)
@@ -199,11 +237,20 @@ end
DelayLineBackwardReservoir(res_size, weight, fb_weight)
DelayLineBackwardReservoir(res_size; weight=0.1, fb_weight=0.2)
-Returns a Delay Line Reservoir constructor to create a matrix with Backward connections
-as described in [1]. The ```weight``` and ```fb_weight``` can be passed as either args or
-kwargs, and they determine the only absolute values of the connections in the reservoir.
+Returns a Delay Line Reservoir constructor to create a matrix with backward connections
+as described in [^Rodan2010]. The `weight` and `fb_weight` can be passed as either arguments or
+keyword arguments, and they determine the absolute values of the connections in the reservoir.
+
+# Arguments
+- `res_size::Int`: The size of the reservoir.
+- `weight::T`: Determines the absolute value of the forward connections in the reservoir. Defaults to 0.1.
+- `fb_weight::T`: Determines the absolute value of the backward (feedback) connections in the reservoir. Defaults to 0.2.
-[1] Rodan, Ali, and Peter Tino. "_Minimum complexity echo state network._"
+# Returns
+A `DelayLineBackwardReservoir` object.
+
+# References
+[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network."
IEEE transactions on neural networks 22.1 (2010): 131-144.
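+
+# Example
+A minimal usage sketch (the reservoir size is illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = DelayLineBackwardReservoir(10; weight = 0.1, fb_weight = 0.2)
+W = create_reservoir(reservoir, 10)  # forward connections equal 0.1, backward connections equal 0.2
+```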
"""
function DelayLineBackwardReservoir(res_size; weight = 0.1, fb_weight = 0.2)
@@ -211,8 +258,8 @@ function DelayLineBackwardReservoir(res_size; weight = 0.1, fb_weight = 0.2)
end
function create_reservoir(reservoir::DelayLineBackwardReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
reservoir_matrix = zeros(res_size, res_size)
for i in 1:(res_size - 1)
@@ -235,10 +282,18 @@ end
SimpleCycleReservoir(res_size; weight=0.1)
Returns a Simple Cycle Reservoir constructor to build a reservoir matrix as
-described in [1]. The ```weight``` can be passed as arg or kwarg, and it determines the
+described in [^Rodan2010]. The `weight` can be passed as an argument or a keyword argument, and it determines the
absolute value of all the connections in the reservoir.
-[1] Rodan, Ali, and Peter Tino. "Minimum complexity echo state network."
+# Arguments
+- `res_size::Int`: The size of the reservoir.
+- `weight::T`: Determines the absolute value of all connections in the reservoir. Defaults to 0.1.
+
+# Returns
+A `SimpleCycleReservoir` object.
+
+# References
+[^Rodan2010]: Rodan, Ali, and Peter Tino. "Minimum complexity echo state network."
IEEE transactions on neural networks 22.1 (2010): 131-144.
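+
+# Example
+A minimal usage sketch (the reservoir size is illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = SimpleCycleReservoir(10; weight = 0.1)
+W = create_reservoir(reservoir, 10)  # 10×10 cycle matrix, non-zero connections all equal 0.1
+```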
"""
function SimpleCycleReservoir(res_size; weight = 0.1)
@@ -246,8 +301,8 @@ function SimpleCycleReservoir(res_size; weight = 0.1)
end
function create_reservoir(reservoir::SimpleCycleReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
reservoir_matrix = zeros(Float64, res_size, res_size)
for i in 1:(res_size - 1)
@@ -272,21 +327,29 @@ end
CycleJumpsReservoir(res_size, cycle_weight, jump_weight, jump_size)
Return a Cycle Reservoir with Jumps constructor to create a reservoir matrix as described
-in [1]. The ```weight``` and ```jump_weight``` can be passed as args or kwargs, and they
-determine the absolute values of all the connections in the reservoir. The ```jump_size```
-can also be passed either as arg or kwarg, and it determines the jumps between
-```jump_weight```s.
+in [^Rodan2012]. The `cycle_weight`, `jump_weight`, and `jump_size` can be passed as arguments or keyword arguments.
+The weights determine the absolute values of the connections in the reservoir, while `jump_size` sets the number of steps between jump connections.
-[1] Rodan, Ali, and Peter Tiňo. "_Simple deterministically constructed cycle reservoirs
-with regular jumps._" Neural computation 24.7 (2012): 1822-1852.
+# Arguments
+- `res_size::Int`: The size of the reservoir.
+- `cycle_weight::T`: The weight of cycle connections.
+- `jump_weight::T`: The weight of jump connections.
+- `jump_size::Int`: The number of steps between jump connections.
+
+# Returns
+A `CycleJumpsReservoir` object.
+
+# References
+[^Rodan2012]: Rodan, Ali, and Peter Tiňo. "Simple deterministically constructed cycle reservoirs
+with regular jumps." Neural computation 24.7 (2012): 1822-1852.
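+
+# Example
+A minimal usage sketch (the reservoir size and parameters are illustrative):
+
+```julia
+using ReservoirComputing
+
+reservoir = CycleJumpsReservoir(20; cycle_weight = 0.1, jump_weight = 0.1, jump_size = 3)
+W = create_reservoir(reservoir, 20)  # 20×20 cycle matrix with jump connections every 3 nodes
+```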
"""
function CycleJumpsReservoir(res_size; cycle_weight = 0.1, jump_weight = 0.1, jump_size = 3)
return CycleJumpsReservoir(res_size, cycle_weight, jump_weight, jump_size)
end
function create_reservoir(reservoir::CycleJumpsReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
reservoir_matrix = zeros(res_size, res_size)
for i in 1:(res_size - 1)
@@ -310,12 +373,21 @@ end
"""
NullReservoir()
-Return a constructor for a matrix `zeros(res_size, res_size)`.
+Return a constructor for a matrix of zeros with dimensions `res_size x res_size`.
+
+# Returns
+A `NullReservoir` object.
"""
struct NullReservoir <: AbstractReservoir end
function create_reservoir(reservoir::NullReservoir,
- res_size;
- matrix_type = Matrix{Float64})
+ res_size;
+ matrix_type = Matrix{Float64})
return Adapt.adapt(matrix_type, zeros(res_size, res_size))
end
diff --git a/src/predict.jl b/src/predict.jl
index 51a23d53..5ec715f2 100644
--- a/src/predict.jl
+++ b/src/predict.jl
@@ -1,9 +1,9 @@
function obtain_prediction(rc::AbstractReservoirComputer,
- prediction::Generative,
- x,
- output_layer,
- args...;
- initial_conditions = output_layer.last_value)
+ prediction::Generative,
+ x,
+ output_layer,
+ args...;
+ initial_conditions = output_layer.last_value)
#x = last_state
prediction_len = prediction.prediction_len
train_method = output_layer.training_method
@@ -21,11 +21,11 @@ function obtain_prediction(rc::AbstractReservoirComputer,
end
function obtain_prediction(rc::AbstractReservoirComputer,
- prediction::Predictive,
- x,
- output_layer,
- args...;
- kwargs...)
+ prediction::Predictive,
+ x,
+ output_layer,
+ args...;
+ kwargs...)
prediction_len = prediction.prediction_len
train_method = output_layer.training_method
out_size = output_layer.out_size
diff --git a/src/reca/reca.jl b/src/reca/reca.jl
index 0a54a881..1568964f 100644
--- a/src/reca/reca.jl
+++ b/src/reca/reca.jl
@@ -25,11 +25,11 @@ arXiv preprint arXiv:1410.0162 (2014).
automata._” arXiv preprint arXiv:1703.02806 (2017).
"""
function RECA(train_data,
- automata;
- generations = 8,
- input_encoding = RandomMapping(),
- nla_type = NLADefault(),
- states_type = StandardStates())
+ automata;
+ generations = 8,
+ input_encoding = RandomMapping(),
+ nla_type = NLADefault(),
+ states_type = StandardStates())
in_size = size(train_data, 1)
#res_size = obtain_res_size(input_encoding, generations)
state_encoding = create_encoding(input_encoding, train_data, generations)
@@ -46,11 +46,11 @@ end
#predict dispatch
function (reca::RECA)(prediction,
- output_layer::AbstractOutputLayer,
- initial_conditions = output_layer.last_value,
- last_state = zeros(reca.input_encoding.ca_size))
+ output_layer::AbstractOutputLayer,
+ initial_conditions = output_layer.last_value,
+ last_state = zeros(reca.input_encoding.ca_size))
return obtain_prediction(reca, prediction, last_state, output_layer;
- initial_conditions = initial_conditions)
+ initial_conditions = initial_conditions)
end
function next_state_prediction!(reca::RECA, x, out, i, args...)
diff --git a/src/reca/reca_input_encodings.jl b/src/reca/reca_input_encodings.jl
index 5a3923b6..8fdd5233 100644
--- a/src/reca/reca_input_encodings.jl
+++ b/src/reca/reca_input_encodings.jl
@@ -41,7 +41,7 @@ function create_encoding(rm::RandomMapping, input_data, generations)
states_size = generations * rm.expansion_size * rm.permutations
ca_size = rm.expansion_size * rm.permutations
return RandomMaps(rm.permutations, rm.expansion_size, generations, maps, states_size,
- ca_size)
+ ca_size)
end
function reca_create_states(rm::RandomMaps, automata, input_data)
@@ -67,9 +67,9 @@ function encoding(rm::RandomMaps, input_vector, tot_encoded_vector)
for i in 1:(rm.permutations)
new_tot_enc_vec[((i - 1) * rm.expansion_size + 1):(i * rm.expansion_size)] = single_encoding(input_vector,
- new_tot_enc_vec[((i - 1) * rm.expansion_size + 1):(i * rm.expansion_size)],
- rm.maps[i,
- :])
+ new_tot_enc_vec[((i - 1) * rm.expansion_size + 1):(i * rm.expansion_size)],
+ rm.maps[i,
+ :])
end
return new_tot_enc_vec
diff --git a/src/states.jl b/src/states.jl
index b5344601..2aac07b0 100644
--- a/src/states.jl
+++ b/src/states.jl
@@ -16,16 +16,21 @@ end
"""
StandardStates()
-No modification of the states takes place, default option.
+When this struct is employed, the states of the reservoir are left unmodified. This is the default option,
+suitable whenever the raw reservoir states are sufficient and no extension or padding is required.
"""
struct StandardStates <: AbstractStates end
"""
ExtendedStates()
-The states are extended with the input data, for the training section, and the prediction
-data, during the prediction section. This is obtained with a vertical concatenation of the
-data and the states.
+The `ExtendedStates` struct is used to extend the reservoir states by
+vertically concatenating the input data (during training) and the prediction data (during the prediction phase).
+This method enriches the state representation by integrating external data, enhancing the model's capability
+to capture and utilize complex patterns in both training and prediction stages.
"""
struct ExtendedStates <: AbstractStates end
@@ -41,8 +46,12 @@ end
PaddedStates(padding)
PaddedStates(;padding=1.0)
-The states are padded with a chosen value. Usually, this value is set to one. The padding is obtained through a
-vertical concatenation of the padding value and the states.
+Creates an instance of the `PaddedStates` struct with the specified padding value, which defaults to 1.0.
+The states of the reservoir are padded by vertically concatenating this value to them,
+which is useful when a constant baseline (bias) term should be appended to the states.
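+
+# Example
+A minimal sketch of the padding applied directly to a toy state vector (in a full model, the states type is typically supplied through the `states_type` keyword of the constructor; values are illustrative):
+
+```julia
+using ReservoirComputing
+
+states = collect(1.0:5.0)
+input = ones(3)
+padded = PaddedStates(padding = 1.0)(NLADefault(), states, input)
+length(padded)  # length(states) + 1
+```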
"""
function PaddedStates(; padding = 1.0)
return PaddedStates(padding)
@@ -52,9 +61,12 @@ end
PaddedExtendedStates(padding)
PaddedExtendedStates(;padding=1.0)
-The states are extended with the training data or predicted data and subsequently padded with a chosen value.
-Usually, the padding value is set to one. The padding and the extension are obtained through a vertical concatenation
-of the padding value, the data, and the states.
+Constructs a `PaddedExtendedStates` struct, which first extends the reservoir states with the training or prediction data
+and then pads them with a specified value (defaulting to 1.0). Both steps are performed through a vertical concatenation
+of the padding value, the data, and the states.
"""
function PaddedExtendedStates(; padding = 1.0)
return PaddedExtendedStates(padding)
@@ -70,14 +82,20 @@ function (::ExtendedStates)(nla_type, x, y)
return nla(nla_type, x_tmp)
end
+# cast the padding to the element type of the states so it does not promote the eltype (handles both matrix and vector states)
function (states_type::PaddedStates)(nla_type, x, y)
- x_tmp = vcat(fill(states_type.padding, (1, size(x, 2))), x)
+ tt = typeof(first(x))
+ x_tmp = vcat(fill(tt(states_type.padding), (1, size(x, 2))), x)
return nla(nla_type, x_tmp)
end
+# pad after extending with the input, casting the padding to the states' element type
function (states_type::PaddedExtendedStates)(nla_type, x, y)
+ tt = typeof(first(x))
x_tmp = vcat(y, x)
- x_tmp = vcat(fill(states_type.padding, (1, size(x, 2))), x_tmp)
+ x_tmp = vcat(fill(tt(states_type.padding), (1, size(x, 2))), x_tmp)
return nla(nla_type, x_tmp)
end
@@ -85,7 +103,12 @@ end
"""
NLADefault()
-Returns the array untouched, default option.
+`NLADefault` represents the default non-linear algorithm option: the input array is returned unchanged.
+It is suitable whenever no non-linear transformation of the states is required.
"""
struct NLADefault <: NonLinearAlgorithm end
@@ -95,15 +118,21 @@ end
"""
NLAT1()
-Applies the \$ \\text{T}_1 \$ transformation algorithm, as defined in [1] and [2].
-[1] Chattopadhyay, Ashesh, et al. "_Data-driven prediction of a multi-scale Lorenz 96
-chaotic system using a hierarchy of deep learning methods: Reservoir computing,
-ANN, and RNN-LSTM._" (2019).
-
-[2] Pathak, Jaideep, et al. "_Model-free prediction of large spatiotemporally chaotic
-systems from data: A reservoir computing approach._"
-Physical review letters 120.2 (2018): 024102.
+`NLAT1` implements the T₁ transformation algorithm introduced in [^Chattopadhyay] and [^Pathak].
+The T₁ algorithm squares the elements of every odd-indexed row of the states array, leaving the remaining rows unchanged.
+This non-linear transformation enhances certain data characteristics,
+making it a valuable tool in analyzing chaotic systems and improving the performance of reservoir computing models.
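+
+# Example
+A worked example on a toy state vector (mirroring the values used in the package tests):
+
+```julia
+using ReservoirComputing
+
+states = Float64.([1, 2, 3, 4, 5, 6, 7, 8, 9])
+ReservoirComputing.nla(NLAT1(), states)
+# [1.0, 2.0, 9.0, 4.0, 25.0, 6.0, 49.0, 8.0, 81.0]
+```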
+
+References:
+[^Chattopadhyay]: Chattopadhyay, Ashesh, et al.
+ "Data-driven prediction of a multi-scale Lorenz 96 chaotic system using a
+ hierarchy of deep learning methods: Reservoir computing, ANN, and RNN-LSTM." (2019).
+[^Pathak]: Pathak, Jaideep, et al.
+ "Model-free prediction of large spatiotemporally chaotic systems from data:
+ A reservoir computing approach."
+ Physical review letters 120.2 (2018): 024102.
"""
struct NLAT1 <: NonLinearAlgorithm end
@@ -120,11 +149,18 @@ end
"""
NLAT2()
-Apply the \$ \\text{T}_2 \$ transformation algorithm, as defined in [1].
-[1] Chattopadhyay, Ashesh, et al. "_Data-driven prediction of a multi-scale Lorenz 96
-chaotic system using a hierarchy of deep learning methods: Reservoir computing, ANN,
-and RNN-LSTM._" (2019).
+`NLAT2` implements the T₂ transformation algorithm as defined in [^Chattopadhyay].
+This transformation algorithm modifies the reservoir states by replacing each odd-indexed
+row, starting from the third row and excluding the last row, with the element-wise product of its two preceding rows.
+This specific approach to non-linear transformation is useful for capturing and
+enhancing complex patterns in the data, particularly beneficial in the analysis of chaotic
+systems and in improving the dynamics within reservoir computing models.
+
+Reference:
+[^Chattopadhyay]: Chattopadhyay, Ashesh, et al.
+ "Data-driven prediction of a multi-scale Lorenz 96 chaotic system using a
+ hierarchy of deep learning methods: Reservoir computing, ANN, and RNN-LSTM." (2019).
"""
struct NLAT2 <: NonLinearAlgorithm end
@@ -141,11 +177,18 @@ end
"""
NLAT3()
-Apply the \$ \\text{T}_3 \$ transformation algorithm, as defined in [1].
-[1] Chattopadhyay, Ashesh, et al. "_Data-driven prediction of a multi-scale Lorenz 96
-chaotic system using a hierarchy of deep learning methods: Reservoir computing, ANN,
-and RNN-LSTM._" (2019).
+The `NLAT3` struct implements the T₃ transformation algorithm as detailed in [^Chattopadhyay].
+This algorithm modifies the reservoir's states by replacing each odd-indexed row, from the third row
+up to (but not including) the last row, with the element-wise product of the immediately preceding
+and the immediately following rows. T₃'s approach to data transformation makes it particularly
+useful for enhancing complex data patterns, thereby improving the modeling and analysis
+capabilities within reservoir computing, especially for chaotic and dynamic systems.
+
+Reference:
+[^Chattopadhyay]: Chattopadhyay, Ashesh, et al.
+ "Data-driven prediction of a multi-scale Lorenz 96 chaotic system using a hierarchy of deep learning methods:
+ Reservoir computing, ANN, and RNN-LSTM." (2019).
"""
struct NLAT3 <: NonLinearAlgorithm end
diff --git a/src/train/linear_regression.jl b/src/train/linear_regression.jl
index 1b7946dc..b1fc2c22 100644
--- a/src/train/linear_regression.jl
+++ b/src/train/linear_regression.jl
@@ -41,15 +41,15 @@ apart from the solver choice. MLJLinearModels.jl needs to be called in order
to use these models.
"""
function LinearModel(
- ; regression = LinearRegression,
- solver = Analytical(),
- regression_kwargs = (;))
+ ; regression = LinearRegression,
+ solver = Analytical(),
+ regression_kwargs = (;))
return LinearModel(regression, solver, regression_kwargs)
end
function LinearModel(regression;
- solver = Analytical(),
- regression_kwargs = (;))
+ solver = Analytical(),
+ regression_kwargs = (;))
return LinearModel(regression, solver, regression_kwargs)
end
@@ -59,7 +59,7 @@ function _train(states, target_data, linear::LinearModel)
for i in 1:size(target_data, 1)
regressor = linear.regression(; fit_intercept = false, linear.regression_kwargs...)
output_layer[i, :] = MLJLinearModels.fit(regressor, states',
- target_data[i, :], solver = linear.solver)
+ target_data[i, :], solver = linear.solver)
end
return OutputLayer(linear, output_layer, out_size, target_data[:, end])
diff --git a/test/esn/test_drivers.jl b/test/esn/test_drivers.jl
index 1739c836..db1c7b01 100644
--- a/test/esn/test_drivers.jl
+++ b/test/esn/test_drivers.jl
@@ -12,23 +12,23 @@ const training_method = StandardRidge(10e-6)
Random.seed!(77)
esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
- reservoir_driver = GRU(variant = FullyGated(),
- reservoir = [
- RandSparseReservoir(res_size, 1.0, 0.5),
- RandSparseReservoir(res_size, 1.2, 0.1),
- ]))
+ reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
+ reservoir_driver = GRU(variant = FullyGated(),
+ reservoir = [
+ RandSparseReservoir(res_size, 1.0, 0.5),
+ RandSparseReservoir(res_size, 1.2, 0.1),
+ ]))
output_layer = train(esn, target_data, training_method)
output = esn(Predictive(target_data), output_layer, initial_conditions = target_data[1])
@test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.11
esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
- reservoir_driver = GRU(variant = Minimal(),
- reservoir = RandSparseReservoir(res_size, 1.0, 0.5),
- inner_layer = DenseLayer(),
- bias = DenseLayer()))
+ reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
+ reservoir_driver = GRU(variant = Minimal(),
+ reservoir = RandSparseReservoir(res_size, 1.0, 0.5),
+ inner_layer = DenseLayer(),
+ bias = DenseLayer()))
output_layer = train(esn, target_data, training_method)
output = esn(Predictive(target_data), output_layer, initial_conditions = target_data[1])
@@ -36,30 +36,30 @@ output = esn(Predictive(target_data), output_layer, initial_conditions = target_
#multiple rnn
esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
- reservoir_driver = MRNN(activation_function = (tanh, sigmoid),
- scaling_factor = (0.8, 0.1)))
+ reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
+ reservoir_driver = MRNN(activation_function = (tanh, sigmoid),
+ scaling_factor = (0.8, 0.1)))
output_layer = train(esn, target_data, training_method)
output = esn(Predictive(target_data), output_layer, initial_conditions = target_data[1])
@test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.11
#deep esn
esn = ESN(input_data;
- reservoir = [
- RandSparseReservoir(res_size, 1.2, 0.1),
- RandSparseReservoir(res_size, 1.2, 0.1),
- ])
+ reservoir = [
+ RandSparseReservoir(res_size, 1.2, 0.1),
+ RandSparseReservoir(res_size, 1.2, 0.1),
+ ])
output_layer = train(esn, target_data, training_method)
output = esn(Predictive(target_data), output_layer, initial_conditions = target_data[1])
@test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.11
esn = ESN(input_data;
- reservoir = [
- RandSparseReservoir(res_size, 1.2, 0.1),
- RandSparseReservoir(res_size, 1.2, 0.1),
- ],
- input_layer = [DenseLayer(), DenseLayer()],
- bias = [NullLayer(), NullLayer()])
+ reservoir = [
+ RandSparseReservoir(res_size, 1.2, 0.1),
+ RandSparseReservoir(res_size, 1.2, 0.1),
+ ],
+ input_layer = [DenseLayer(), DenseLayer()],
+ bias = [NullLayer(), NullLayer()])
output_layer = train(esn, target_data, training_method)
output = esn(Predictive(target_data), output_layer, initial_conditions = target_data[1])
@test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.11
diff --git a/test/esn/test_hybrid.jl b/test/esn/test_hybrid.jl
index a28a0981..4f858208 100644
--- a/test/esn/test_hybrid.jl
+++ b/test/esn/test_hybrid.jl
@@ -34,8 +34,8 @@ hybrid = Hybrid(prior_model_data_generator, u0, tspan_train, train_len)
Random.seed!(77)
esn = ESN(input_data,
- reservoir = RandSparseReservoir(300),
- variation = hybrid)
+ reservoir = RandSparseReservoir(300),
+ variation = hybrid)
output_layer = train(esn, target_data, StandardRidge(0.3))
diff --git a/test/esn/test_nla.jl b/test/esn/test_nla.jl
index 2b0a7327..f5ac42f8 100644
--- a/test/esn/test_nla.jl
+++ b/test/esn/test_nla.jl
@@ -1,23 +1,23 @@
-using ReservoirComputing, Random
+using ReservoirComputing
-const res_size = 20
-const ts = 0.0:0.1:50.0
-const data = sin.(ts)
-const train_len = 400
-const input_data = reduce(hcat, data[1:(train_len - 1)])
-const target_data = reduce(hcat, data[2:train_len])
-const predict_len = 100
-const test = reduce(hcat, data[(train_len + 1):(train_len + predict_len)])
-const training_method = StandardRidge(10e-6)
+states = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+nla1_states = [1, 2, 9, 4, 25, 6, 49, 8, 81]
+nla2_states = [1, 2, 2, 4, 12, 6, 30, 8, 9]
+nla3_states = [1, 2, 8, 4, 24, 6, 48, 8, 9]
-nlas = [NLADefault(), NLAT1(), NLAT2(), NLAT3()]
+test_types = [Float64, Float32, Float16]
-for n in nlas
- Random.seed!(77)
- esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1),
- nla_type = n)
- output_layer = train(esn, target_data, training_method)
- output = esn(Generative(predict_len), output_layer)
- @test maximum(abs.(test .- output)) ./ maximum(abs.(test)) < 0.1
+for tt in test_types
+ # test default
+ nla_states = ReservoirComputing.nla(NLADefault(), tt.(states))
+ @test nla_states == tt.(states)
+ # test NLAT1
+ nla_states = ReservoirComputing.nla(NLAT1(), tt.(states))
+    @test nla_states == tt.(nla1_states)
+ # test nlat2
+ nla_states = ReservoirComputing.nla(NLAT2(), tt.(states))
+    @test nla_states == tt.(nla2_states)
+ # test nlat3
+ nla_states = ReservoirComputing.nla(NLAT3(), tt.(states))
+    @test nla_states == tt.(nla3_states)
end
diff --git a/test/esn/test_reservoirs.jl b/test/esn/test_reservoirs.jl
index 8650c1f7..ac751712 100644
--- a/test/esn/test_reservoirs.jl
+++ b/test/esn/test_reservoirs.jl
@@ -22,7 +22,7 @@ reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test maximum(reservoir_matrix) <= radius
reservoir_constructor = PseudoSVDReservoir(res_size, max_value = radius,
- sparsity = sparsity)
+ sparsity = sparsity)
reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test size(reservoir_matrix) == (res_size, res_size)
@test maximum(reservoir_matrix) <= radius
@@ -45,7 +45,7 @@ reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test maximum(reservoir_matrix) == weight
reservoir_constructor = DelayLineBackwardReservoir(res_size, weight = weight,
- fb_weight = weight)
+ fb_weight = weight)
reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test size(reservoir_matrix) == (res_size, res_size)
@test maximum(reservoir_matrix) == weight
@@ -68,7 +68,7 @@ reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test maximum(reservoir_matrix) == weight
reservoir_constructor = CycleJumpsReservoir(res_size, cycle_weight = weight,
- jump_weight = weight, jump_size = jump_size)
+ jump_weight = weight, jump_size = jump_size)
reservoir_matrix = create_reservoir(reservoir_constructor, res_size)
@test size(reservoir_matrix) == (res_size, res_size)
@test maximum(reservoir_matrix) == weight
diff --git a/test/esn/test_states.jl b/test/esn/test_states.jl
index 1b2b8d71..479d29c9 100644
--- a/test/esn/test_states.jl
+++ b/test/esn/test_states.jl
@@ -1,22 +1,46 @@
-using ReservoirComputing, Random
-
-const res_size = 20
-const ts = 0.0:0.1:50.0
-const data = sin.(ts)
-const train_len = 400
-const input_data = reduce(hcat, data[1:(train_len - 1)])
-const target_data = reduce(hcat, data[2:train_len])
-const predict_len = 100
-const test_data = reduce(hcat, data[(train_len + 1):(train_len + predict_len)])
-const training_method = StandardRidge(10e-6)
+using ReservoirComputing
+
+test_types = [Float64, Float32, Float16]
+states = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+in_data = fill(1, 3)
states_types = [StandardStates, ExtendedStates, PaddedStates, PaddedExtendedStates]
-for t in states_types
- Random.seed!(77)
- esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1))
- output_layer = train(esn, target_data, training_method)
- output = esn(Generative(predict_len), output_layer)
- @test maximum(abs.(test_data .- output)) ./ maximum(abs.(test_data)) < 0.1
+# testing extension and padding
+for tt in test_types
+ st_states = StandardStates()(NLADefault(), tt.(states), tt.(in_data))
+ @test length(st_states) == length(states)
+ @test typeof(st_states) == typeof(tt.(states))
+
+ st_states = ExtendedStates()(NLADefault(), tt.(states), tt.(in_data))
+ @test length(st_states) == length(states) + length(in_data)
+ @test typeof(st_states) == typeof(tt.(states))
+
+ st_states = PaddedStates()(NLADefault(), tt.(states), tt.(in_data))
+ @test length(st_states) == length(states) + 1
+ @test typeof(st_states[1]) == typeof(tt.(states)[1])
+
+ st_states = PaddedExtendedStates()(NLADefault(), tt.(states), tt.(in_data))
+ @test length(st_states) == length(states) + length(in_data) + 1
+ @test typeof(st_states[1]) == typeof(tt.(states)[1])
+end
+
+## testing non linear algos
+nla1_states = [1, 2, 9, 4, 25, 6, 49, 8, 81]
+nla2_states = [1, 2, 2, 4, 12, 6, 30, 8, 9]
+nla3_states = [1, 2, 8, 4, 24, 6, 48, 8, 9]
+
+for tt in test_types
+ # test default
+ nla_states = ReservoirComputing.nla(NLADefault(), tt.(states))
+ @test nla_states == tt.(states)
+ # test NLAT1
+ nla_states = ReservoirComputing.nla(NLAT1(), tt.(states))
+ @test nla_states == tt.(nla1_states)
+ # test nlat2
+ nla_states = ReservoirComputing.nla(NLAT2(), tt.(states))
+ @test nla_states == tt.(nla2_states)
+ # test nlat3
+ nla_states = ReservoirComputing.nla(NLAT3(), tt.(states))
+ @test nla_states == tt.(nla3_states)
end
diff --git a/test/esn/test_train.jl b/test/esn/test_train.jl
index 731ff773..e5140b97 100644
--- a/test/esn/test_train.jl
+++ b/test/esn/test_train.jl
@@ -12,7 +12,7 @@ const reg = 10e-6
Random.seed!(77)
esn = ESN(input_data;
- reservoir = RandSparseReservoir(res_size, 1.2, 0.1))
+ reservoir = RandSparseReservoir(res_size, 1.2, 0.1))
training_methods = [
StandardRidge(regularization_coeff = reg),
@@ -24,7 +24,7 @@ training_methods = [
for t in training_methods
output_layer = train(esn, target_data, t)
output = esn(Predictive(input_data), output_layer)
- @test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.21
+ @test mean(abs.(target_data .- output)) ./ mean(abs.(target_data)) < 0.22
end
for t in training_methods
diff --git a/test/reca/test_predictive.jl b/test/reca/test_predictive.jl
index c5ca95d4..1fb682c9 100644
--- a/test/reca/test_predictive.jl
+++ b/test/reca/test_predictive.jl
@@ -6,8 +6,8 @@ const g = 6
const rule = 90
reca = RECA(input, DCA(rule);
- generations = g,
- input_encoding = RandomMapping(6, 10))
+ generations = g,
+ input_encoding = RandomMapping(6, 10))
output_layer = train(reca, output, StandardRidge(0.001))
prediction = reca(Predictive(input), output_layer)
diff --git a/test/runtests.jl b/test/runtests.jl
index 9b5d73f9..d0804317 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,20 +2,37 @@ using SafeTestsets
using Test
@testset "Common Utilities " begin
- @safetestset "Quality Assurance" begin include("qa.jl") end
- @safetestset "States" begin include("test_states.jl") end
+ @safetestset "Quality Assurance" begin
+ include("qa.jl")
+ end
+ @safetestset "States" begin
+ include("test_states.jl")
+ end
end
@testset "Echo State Networks" begin
- @safetestset "ESN Input Layers" begin include("esn/test_input_layers.jl") end
- @safetestset "ESN Reservoirs" begin include("esn/test_reservoirs.jl") end
- @safetestset "ESN States" begin include("esn/test_states.jl") end
- @safetestset "ESN Train and Predict" begin include("esn/test_train.jl") end
- @safetestset "ESN Drivers" begin include("esn/test_drivers.jl") end
- @safetestset "ESN Non Linear Algos" begin include("esn/test_nla.jl") end
- @safetestset "Hybrid ESN" begin include("esn/test_hybrid.jl") end
+ @safetestset "ESN Input Layers" begin
+ include("esn/test_input_layers.jl")
+ end
+ @safetestset "ESN Reservoirs" begin
+ include("esn/test_reservoirs.jl")
+ end
+ @safetestset "ESN States" begin
+ include("esn/test_states.jl")
+ end
+ @safetestset "ESN Train and Predict" begin
+ include("esn/test_train.jl")
+ end
+ @safetestset "ESN Drivers" begin
+ include("esn/test_drivers.jl")
+ end
+ @safetestset "Hybrid ESN" begin
+ include("esn/test_hybrid.jl")
+ end
end
@testset "CA based Reservoirs" begin
- @safetestset "RECA" begin include("reca/test_predictive.jl") end
+ @safetestset "RECA" begin
+ include("reca/test_predictive.jl")
+ end
end
diff --git a/test/test_states.jl b/test/test_states.jl
index becb8df5..c8808bbf 100644
--- a/test/test_states.jl
+++ b/test/test_states.jl
@@ -38,12 +38,12 @@ padded_array = states_type(NLADefault(), test_array, extension)
states_type = PaddedExtendedStates(padding = padding)
padded_extended_array = states_type(NLADefault(), test_array, extension)
@test padded_extended_array == reshape(vcat(padding, extension, test_array),
- length(test_array) + length(extension) + 1, 1)
+ length(test_array) + length(extension) + 1, 1)
states_type = PaddedExtendedStates(padding)
padded_extended_array = states_type(NLADefault(), test_array, extension)
@test padded_extended_array == reshape(vcat(padding, extension, test_array),
- length(test_array) + length(extension) + 1, 1)
+ length(test_array) + length(extension) + 1, 1)
states_type = ExtendedStates()
extended_array = states_type(NLADefault(), test_array, extension)