diff --git a/examples/example-000-000-overview.cc b/examples/example-000-000-overview.cc
index 9bad0cc..a80a389 100644
--- a/examples/example-000-000-overview.cc
+++ b/examples/example-000-000-overview.cc
@@ -54,42 +54,42 @@ typedef Simulator::action_type A;
 // state transition. Let us use our own type to store the transition
 // elements.
 struct Transition {
-  S s;
-  A a;
-  Reward r;
-  S s_; // read s_ as s'
-  bool is_terminal;
+  S s;
+  A a;
+  Reward r;
+  S s_; // read s_ as s'
+  bool is_terminal;
 };
 
 std::string string_of_action(A a) {
-  std::string res;
-  switch(a) {
-  case rl::problem::cliff_walking::Action::actionNorth: res = "North"; break;
-  case rl::problem::cliff_walking::Action::actionSouth: res = "South"; break;
-  case rl::problem::cliff_walking::Action::actionEast: res = "East "; break;
-  case rl::problem::cliff_walking::Action::actionWest: res = "West "; break;
-  default: res = "?????";
-  }
-  return res;
+  std::string res;
+  switch(a) {
+  case rl::problem::cliff_walking::Action::actionNorth: res = "North"; break;
+  case rl::problem::cliff_walking::Action::actionSouth: res = "South"; break;
+  case rl::problem::cliff_walking::Action::actionEast: res = "East "; break;
+  case rl::problem::cliff_walking::Action::actionWest: res = "West "; break;
+  default: res = "?????";
+  }
+  return res;
 }
 
 // This prints a transition.
 std::ostream& operator<<(std::ostream& os, const Transition& t) {
-  os << std::setw(3) << t.s << ' ' << string_of_action(t.a)
-     << " ---" << std::setw(5) << t.r << " ---> ";
-  if(t.is_terminal)
-    os << "End-of-Episode";
-  else
-    os << std::setw(3) << t.s_;
-  return os;
+  os << std::setw(3) << t.s << ' ' << string_of_action(t.a)
+     << " ---" << std::setw(5) << t.r << " ---> ";
+  if(t.is_terminal)
+    os << "End-of-Episode";
+  else
+    os << std::setw(3) << t.s_;
+  return os;
 }
 
 // This functions makes a transition from its elements.
 Transition make_transition(S s, A a, Reward r, S s_) {
-  return {s,a,r,s_,false};
+  return {s,a,r,s_,false};
 }
 Transition make_terminal_transition(S s, A a, Reward r) {
-  return {s,a,r,s /* unused */,true};
+  return {s,a,r,s /* unused */,true};
 }
 
 // Let us define the parameters.
@@ -115,8 +115,8 @@ Transition make_terminal_transition(S s, A a, Reward r) {
 
 // This method simply retrives a q value from a gsl vector.
 double q_parametrized(const gsl_vector* theta,
-                      S s, A a) {
-  return gsl_vector_get(theta,TABULAR_Q_RANK(s,a));
+                      S s, A a) {
+  return gsl_vector_get(theta,TABULAR_Q_RANK(s,a));
 }
 
 // In the Q-Learning algorithm, updates are made according to the
@@ -125,9 +125,9 @@ double q_parametrized(const gsl_vector* theta,
 // gradient is straightforward, since it is a (00..00100..00) vector
 // with a 1 at the (s,a) rank position.
 void grad_q_parametrized(const gsl_vector* theta,
-                         gsl_vector* grad_theta_sa,
-                         S s, A a) {
-  gsl_vector_set_basis(grad_theta_sa,TABULAR_Q_RANK(s,a));
+                         gsl_vector* grad_theta_sa,
+                         S s, A a) {
+  gsl_vector_set_basis(grad_theta_sa,TABULAR_Q_RANK(s,a));
 }
 
 
@@ -137,107 +137,111 @@ using namespace std::placeholders;
 
 // Let us start some experiment
 int main(int argc, char* argv[]) {
-
-  // We need to provide iterators for enumerating all the state and action
-  // values. This can be done easily from an enumerators.
-  auto action_begin = rl::enumerator(rl::problem::cliff_walking::Action::actionNorth);
-  auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
-  auto state_begin = rl::enumerator(Cliff::start);
-  auto state_end = state_begin + Cliff::size;
-
-
-  // This is the dynamical system we want to control.
-  Param param;
-  Simulator simulator(param);
-
-  // Our Q-function is determined by some vector parameter. It is a
-  // gsl_vector since we use the GSL-based algorithm provided by the
-  // library.
-  gsl_vector* theta = gsl_vector_alloc(TABULAR_Q_CARDINALITY);
-  gsl_vector_set_zero(theta);
-
-  // If we need to use the Q-function parametrized by theta as q(s,a),
-  // we only have to bind our q_from_table function and get a
-  // functional object.
-  auto q = std::bind(q_parametrized,theta,_1,_2);
-
-  // Let us now define policies, related to q. The learning policy
-  // used is an epsilon-greedy one in the following, while we test the
-  // learned Q-function with a geedy policy.
-  auto learning_policy = rl::policy::epsilon_greedy(q,paramEPSILON,action_begin,action_end);
-  auto test_policy = rl::policy::greedy(q,action_begin,action_end);
-
-  // We intend to learn q on-line, by running episodes, and updating a
-  // critic fro the transition we get during the episodes. Let us use
-  // some GSL-based critic for that purpose.
-  auto critic = rl::gsl::sarsa(theta,
-                               paramGAMMA,paramALPHA,
-                               q_parametrized,
-                               grad_q_parametrized);
-
-  // We have now all the elements to start experiments.
-
-
-  // Let us run 10000 episodes with the agent that learns the Q-values.
-
-  std::cout << "Learning " << std::endl
-            << std::endl;
-
-  int episode;
-  for(episode = 0; episode < 10000; ++episode) {
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  // We need to provide iterators for enumerating all the state and action
+  // values. This can be done easily with enumerators.
+  auto action_begin = rl::enumerator(rl::problem::cliff_walking::Action::actionNorth);
+  auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
+  auto state_begin = rl::enumerator(Cliff::start);
+  auto state_end = state_begin + Cliff::size;
+
+
+  // This is the dynamical system we want to control.
+  Param param;
+  Simulator simulator(param);
+
+  // Our Q-function is determined by some vector parameter. It is a
+  // gsl_vector since we use the GSL-based algorithm provided by the
+  // library.
+  gsl_vector* theta = gsl_vector_alloc(TABULAR_Q_CARDINALITY);
+  gsl_vector_set_zero(theta);
+
+  // If we need to use the Q-function parametrized by theta as q(s,a),
+  // we only have to bind our q_from_table function and get a
+  // functional object.
+  auto q = std::bind(q_parametrized,theta,_1,_2);
+
+  // Let us now define policies, related to q. The learning policy
+  // used is an epsilon-greedy one in the following, while we test the
+  // learned Q-function with a greedy policy.
+  double epsilon = paramEPSILON;
+  auto learning_policy = rl::policy::epsilon_greedy(q,epsilon,action_begin,action_end, gen);
+  auto test_policy = rl::policy::greedy(q,action_begin,action_end);
+
+  // We intend to learn q on-line, by running episodes, and updating a
+  // critic from the transitions we get during the episodes. Let us use
+  // some GSL-based critic for that purpose.
+  auto critic = rl::gsl::sarsa(theta,
+                               paramGAMMA,paramALPHA,
+                               q_parametrized,
+                               grad_q_parametrized);
+
+  // We have now all the elements to start experiments.
+
+
+  // Let us run 10000 episodes with the agent that learns the Q-values.
+
+  std::cout << "Learning " << std::endl
+            << std::endl;
+
+  int episode;
+  for(episode = 0; episode < 10000; ++episode) {
+    simulator.restart();
+    auto actual_episode_length = rl::episode::learn(simulator,learning_policy,critic,
+                                                    0);
+    if(episode % 200 == 0)
+      std::cout << "episode " << std::setw(5) << episode+1
+                << " : length = " << std::setw(5) << actual_episode_length << std::endl;
+  }
+  std::cout << std::endl;
+
+  // Let us print the parameters. This can be dumped in a file, rather
+  // than printed, for saving the learned Q-value function.
+  std::cout << "Learned theta : " << std::endl
+            << std::endl
+            << theta << std::endl
+            << std::endl;
+
+
+  // Let us define v as v(s) = max_a q(s,a) with a lambda function.
+  auto v = [&action_begin,&action_end,&q](S s) -> double {return rl::max(std::bind(q,s,_1),
+                                                                         action_begin,
+                                                                         action_end);};
+  // We can draw the value function into an image file.
+  auto v_range = rl::range(v,state_begin,state_end);
+  std::cout << std::endl
+            << " V in [" << v_range.first << ',' << v_range.second << "]." << std::endl
+            << std::endl;
+  Cliff::draw("V-overview",0,v,v_range.first,v_range.second);
+  std::cout << "Image file \"V-overview-000000.ppm\" generated." << std::endl
+            << std::endl;
+
+  // Let us be greedy on the policy we have found, using the greedy
+  // agent to run an episode.
+  simulator.restart();
+  unsigned int nb_steps = rl::episode::run(simulator,test_policy,0);
+  std::cout << "Best policy episode ended after " << nb_steps << " steps." << std::endl;
+
+  // We can also gather the transitions from an episode into a collection.
+  std::vector<Transition> transition_set;
   simulator.restart();
-    auto actual_episode_length = rl::episode::learn(simulator,learning_policy,critic,
-                                                    0);
-    if(episode % 200 == 0)
-      std::cout << "episode " << std::setw(5) << episode+1
-                << " : length = " << std::setw(5) << actual_episode_length << std::endl;
-  }
-  std::cout << std::endl;
-
-  // Let us print the parameters. This can be dumped in a file, rather
-  // than printed, for saving the learned Q-value function.
-  std::cout << "Learned theta : " << std::endl
-            << std::endl
-            << theta << std::endl
-            << std::endl;
-
-
-  // Let us define v as v(s) = max_a q(s_a) with a labda function.
-  auto v = [&action_begin,&action_end,&q](S s) -> double {return rl::max(std::bind(q,s,_1),
-                                                                         action_begin,
-                                                                         action_end);};
-  // We can draw the Value function a image file.
-  auto v_range = rl::range(v,state_begin,state_end);
-  std::cout << std::endl
-            << " V in [" << v_range.first << ',' << v_range.second << "]." << std::endl
-            << std::endl;
-  Cliff::draw("V-overview",0,v,v_range.first,v_range.second);
-  std::cout << "Image file \"V-overview-000000.ppm\" generated." << std::endl
-            << std::endl;
-
-  // Let us be greedy on the policy we have found, using the greedy
-  // agent to run an episode.
-  simulator.restart();
-  unsigned int nb_steps = rl::episode::run(simulator,test_policy,0);
-  std::cout << "Best policy episode ended after " << nb_steps << " steps." << std::endl;
-
-  // We can also gather the transitions from an episode into a collection.
-  std::vector<Transition> transition_set;
-  simulator.restart();
-  nb_steps = rl::episode::run(simulator,test_policy,
-                              std::back_inserter(transition_set),
-                              make_transition,make_terminal_transition,
-                              0);
-  std::cout << std::endl
-            << "Collected transitions :" << std::endl
-            << "---------------------" << std::endl
-            << nb_steps << " == " << transition_set.size() << std::endl
-            << std::endl;
-  for(auto& t : transition_set)
-    std::cout << t << std::endl;
-
-
-  gsl_vector_free(theta);
-  return 0;
+  nb_steps = rl::episode::run(simulator,test_policy,
+                              std::back_inserter(transition_set),
+                              make_transition,make_terminal_transition,
+                              0);
+  std::cout << std::endl
+            << "Collected transitions :" << std::endl
+            << "---------------------" << std::endl
+            << nb_steps << " == " << transition_set.size() << std::endl
+            << std::endl;
+  for(auto& t : transition_set)
+    std::cout << t << std::endl;
+
+
+  gsl_vector_free(theta);
+  return 0;
 }
diff --git a/examples/example-003-002-pendulum-mlp-ktdq.cc b/examples/example-003-002-pendulum-mlp-ktdq.cc
index ceff2f9..c108f1b 100644
--- a/examples/example-003-002-pendulum-mlp-ktdq.cc
+++ b/examples/example-003-002-pendulum-mlp-ktdq.cc
@@ -109,7 +109,7 @@ int main(int argc, char* argv[]) {
   auto q = std::bind(q_parametrized,theta,_1,_2);
 
   rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
-  rl::enumerator a_end = a_begin+ rl::problem::inverted_pendulum::action_size;
+  rl::enumerator a_end = a_begin+ rl::problem::inverted_pendulum::actionSize;
 
   auto critic = rl::gsl::ktd_q(theta,
                                q_parametrized,
diff --git a/examples/example-003-003-mountain-car-ktdsarsa.cc b/examples/example-003-003-mountain-car-ktdsarsa.cc
index 4110131..72434eb 100644
--- a/examples/example-003-003-mountain-car-ktdsarsa.cc
+++ b/examples/example-003-003-mountain-car-ktdsarsa.cc
@@ -174,14 +174,15 @@ void train(int nb_episodes, bool make_movie, RANDOM_GENERATOR& gen) {
 
   auto q = std::bind(q_parametrized,theta,_1,_2);
 
-  // std::array actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
+  // std::array actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
   // auto a_begin = actions.begin();
   // auto a_end = actions.end();
 
   rl::enumerator a_begin(rl::problem::mountain_car::Action::actionNone); // This MUST be the lowest value of the enum type of actions and action enum values are consecutive for mountain_car
   rl::enumerator a_end = a_begin+rl::problem::mountain_car::actionSize;
 
-  auto explore_agent = rl::policy::epsilon_greedy(q,paramEPSILON,a_begin,a_end, gen);
+  double epsilon = paramEPSILON;
+  auto explore_agent = rl::policy::epsilon_greedy(q,epsilon,a_begin,a_end, gen);
   auto greedy_agent = rl::policy::greedy(q,a_begin,a_end);
 
   auto critic = rl::gsl::ktd_sarsa(theta,
diff --git a/examples/example-003-004-pendulum-onpolicy-LSPI.cc b/examples/example-003-004-pendulum-onpolicy-LSPI.cc
index 4e60986..eebc56d 100644
--- a/examples/example-003-004-pendulum-onpolicy-LSPI.cc
+++ b/examples/example-003-004-pendulum-onpolicy-LSPI.cc
@@ -25,10 +25,10 @@
 */
 
 /*
-  This example shows how to use a parametric representation of the
-  Q-function and apply recursive LSTD-Q to estimate the Q function. The inverted pendulum problem is solved here.
-  LSTD-Q gathers some transitions before updating the parameter vector. Then, the parameter vector is continuously updated.
-*/
+  This example shows how to use a parametric representation of the
+  Q-function and apply recursive LSTD-Q to estimate the Q function. The inverted pendulum problem is solved here.
+  LSTD-Q gathers some transitions before updating the parameter vector. Then, the parameter vector is continuously updated.
+ */
 
 #include
 #include
@@ -38,13 +38,13 @@
 #include
 #include
 #include
-#include
 #include
+#include
 
 using namespace std::placeholders;
 
 // This is our simulator.
-typedef rl::problem::inverted_pendulum::Simulator Simulator;
+using Simulator = rl::problem::inverted_pendulum::Simulator;
 
 // Definition of Reward, S, A, Transition and TransitionSet.
 #include "example-defs-transition.hpp"
@@ -69,53 +69,55 @@ typedef rl::problem::inverted_pendulum::Simulator
-  auto q_parametrized = [tmp](const gsl_vector* th,S s, A a) -> Reward {double res;
-    phi_rbf(tmp,s,a); // phi_sa = phi(s,a)
-    gsl_blas_ddot(th,tmp,&res); // res = th^T . phi_sa
-    return res;};
-
-
-  auto q = std::bind(q_parametrized,theta,_1,_2);
-
-  // We instantiate our LSTD-Q
-  //auto critic = rl::gsl::LSTDQ_Lambda(theta, paramGAMMA, paramREG, .4, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
-  auto critic = rl::gsl::LSTDQ(theta, paramGAMMA, paramREG, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
-
-  rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
-  rl::enumerator a_end = a_begin+3;
-  auto greedy_policy = rl::policy::greedy(q, a_begin,a_end);
-
-  try {
-
-    // Let us initialize the random seed.
-    rl::random::seed(getpid());
-
-    for(episode = 0 ; episode < NB_OF_EPISODES; ++episode) {
-      simulator.setPhase(Simulator::phase_type());
-      episode_length = rl::episode::learn(simulator,
-                                          greedy_policy,critic,
-                                          MAX_EPISODE_LENGTH);
-      //std::cout << "\r Episode " << episode << " : " << episode_length << std::flush;
-      // After each episode, we test our policy for NB_OF_TESTING_EPISODES
-      // episodes
-      double cumul_episode_length = 0.0;
-      for(unsigned int tepi = 0 ; tepi < NB_OF_TESTING_EPISODES; ++tepi) {
-        simulator.setPhase(Simulator::phase_type());
-        cumul_episode_length += rl::episode::run(simulator, greedy_policy, MAX_EPISODE_LENGTH);
-      }
-      std::cout << "\r Episode " << episode << " : mean length over " << NB_OF_TESTING_EPISODES << " episodes is " << cumul_episode_length/double(NB_OF_TESTING_EPISODES) << std::string(10, ' ') << std::flush;
-
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  int episode,episode_length;
+
+  Simulator simulator(gen);
+
+  gsl_vector* theta = gsl_vector_calloc(PHI_RBF_DIMENSION);
+  gsl_vector* tmp = gsl_vector_calloc(PHI_RBF_DIMENSION);
+
+  auto q_parametrized = [tmp](const gsl_vector* th,S s, A a) -> Reward {double res;
+    phi_rbf(tmp,s,a); // phi_sa = phi(s,a)
+    gsl_blas_ddot(th,tmp,&res); // res = th^T . phi_sa
+    return res;};
+
+
+  auto q = std::bind(q_parametrized,theta,_1,_2);
+
+  // We instantiate our LSTD-Q
+  //auto critic = rl::gsl::LSTDQ_Lambda(theta, paramGAMMA, paramREG, .4, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
+  auto critic = rl::gsl::LSTDQ(theta, paramGAMMA, paramREG, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
+
+  rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
+  rl::enumerator a_end = a_begin+rl::problem::inverted_pendulum::actionSize;
+  auto greedy_policy = rl::policy::greedy(q, a_begin,a_end);
+
+  Simulator::phase_type start_phase;
+  try {
+    for(episode = 0 ; episode < NB_OF_EPISODES; ++episode) {
+      start_phase.random(gen);
+      simulator.setPhase(start_phase);
+      episode_length = rl::episode::learn(simulator,
+                                          greedy_policy,critic,
+                                          MAX_EPISODE_LENGTH);
+      //std::cout << "\r Episode " << episode << " : " << episode_length << std::flush;
+      // After each episode, we test our policy for NB_OF_TESTING_EPISODES
+      // episodes
+      double cumul_episode_length = 0.0;
+      for(unsigned int tepi = 0 ; tepi < NB_OF_TESTING_EPISODES; ++tepi) {
+        start_phase.random(gen);
+        simulator.setPhase(start_phase);
+        cumul_episode_length += rl::episode::run(simulator, greedy_policy, MAX_EPISODE_LENGTH);
+      }
+      std::cout << "\r Episode " << episode << " : mean length over " << NB_OF_TESTING_EPISODES << " episodes is " << cumul_episode_length/double(NB_OF_TESTING_EPISODES) << std::string(10, ' ') << std::flush;
+
+    }
+  }
+  catch(rl::exception::Any& e) {
+    std::cerr << "Exception caught : " << e.what() << std::endl;
   }
-  }
-  catch(rl::exception::Any& e) {
-    std::cerr << "Exception caught : " << e.what() << std::endl;
-  }
-  std::cout << std::endl;
+  std::cout << std::endl;
 }
diff --git a/examples/example-004-001-cliff-onestep.cc b/examples/example-004-001-cliff-onestep.cc
index 088b450..45a5d00 100644
--- a/examples/example-004-001-cliff-onestep.cc
+++ b/examples/example-004-001-cliff-onestep.cc
@@ -3,6 +3,8 @@
 // Learner : One-step Actor-Critic
 
 #include
+#include
+
 
 #define NB_EPISODES 3000
@@ -19,14 +21,15 @@ using S = Simulator::observation_type;
 using A = Simulator::action_type;
 
 // The controller architecture
-using Architecture = rl::gsl::ActorCritic::Architecture::Tabular;
+using Architecture = rl::gsl::ActorCritic::Architecture::Tabular;
 
 // The algorithm to train the controller
 using Learner = rl::gsl::ActorCritic::Learner::OneStep;
 
 int main(int argc, char* argv[]) {
-  std::srand(time(0));
-
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
   // 1) Instantiate the simulator
   Param param;
   Simulator simulator(param);
@@ -37,7 +40,7 @@ int main(int argc, char* argv[]) {
   unsigned int nb_features = Cliff::size;
   Architecture archi(nb_features,
                      [](const S& s) { return s;},
-                     action_begin, action_end);
+                     action_begin, action_end, gen);
 
   // 3) Instantiate the learner
   Learner learner(archi, paramGAMMA, paramALPHA_V, paramALPHA_P);
diff --git a/examples/example-defs-cliff-experiments.hpp b/examples/example-defs-cliff-experiments.hpp
index c96c070..d43d198 100644
--- a/examples/example-defs-cliff-experiments.hpp
+++ b/examples/example-defs-cliff-experiments.hpp
@@ -141,7 +141,8 @@ void make_experiment(CRITIC& critic,
   auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
   auto state_begin = rl::enumerator(Cliff::start);
   auto state_end = state_begin + Cliff::size;
-  auto learning_policy = rl::policy::epsilon_greedy(q,paramEPSILON,
+  double epsilon = paramEPSILON;
+  auto learning_policy = rl::policy::epsilon_greedy(q,epsilon,
                                                     action_begin,action_end, gen);
   auto test_policy = rl::policy::greedy(q,action_begin,action_end);
   int episode,frame;
diff --git a/src/rl-inverted-pendulum.hpp b/src/rl-inverted-pendulum.hpp
index 002d0e2..ea94831 100644
--- a/src/rl-inverted-pendulum.hpp
+++ b/src/rl-inverted-pendulum.hpp
@@ -45,7 +45,7 @@ namespace rl {
        actionLeft = 1,
        actionRight= 2
       };
-      constexpr int action_size = 3;
+      constexpr int actionSize = 3;
 
       // some exceptions for state and action consistancy
       class BadAction : public rl::exception::Any {
diff --git a/src/rlActorCritic.hpp b/src/rlActorCritic.hpp
index 7d26eb7..8aff053 100644
--- a/src/rlActorCritic.hpp
+++ b/src/rlActorCritic.hpp
@@ -75,6 +75,7 @@ namespace rl {
       rl::enumerator _action_begin;
       rl::enumerator _action_end;
       std::function _q_function;
+      double temperature;
       std::function _policy;
 
       Actor(unsigned int nb_state_features,
@@ -86,7 +87,8 @@ namespace rl {
        _params(gsl_vector_alloc(nb_state_features*nb_actions)),
        _action_begin(action_begin), _action_end(action_end),
        _q_function(std::bind(&Actor::q_function, std::ref(*this), std::placeholders::_1, std::placeholders::_2)),
-       _policy(rl::policy::softmax(_q_function, 1.0, _action_begin, _action_end, gen)){
+       temperature(1.0),
+       _policy(rl::policy::softmax(_q_function, temperature, _action_begin, _action_end, gen)){
        gsl_vector_set_zero(_params);
       }
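
Usage sketch (illustration only, not part of the patch): the change repeated across these examples is to seed an explicit std::mt19937 generator instead of calling std::srand()/rl::random::seed(), and to hand rl::policy::epsilon_greedy a named double epsilon rather than the paramEPSILON constant directly, presumably so the policy can keep referring to a value the caller may still tune. The standalone snippet below reproduces that pattern with the standard library only; epsilon_greedy_action and its signature are hypothetical stand-ins, not the rl::policy::epsilon_greedy API.

// Standalone illustration of the epsilon-greedy-with-explicit-generator
// pattern adopted above. Everything here is plain standard C++; the function
// name and signature are made up for the example.
#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

using S = int; // stand-in state type
using A = int; // stand-in action type

// Pick a uniformly random action with probability epsilon, the greedy one otherwise.
A epsilon_greedy_action(const std::function<double(S, A)>& q,
                        const double& epsilon,           // named lvalue, as in the patch
                        const std::vector<A>& actions,
                        S s,
                        std::mt19937& gen) {             // explicit generator, as in the patch
  std::bernoulli_distribution explore(epsilon);
  if(explore(gen)) {
    std::uniform_int_distribution<std::size_t> pick(0, actions.size() - 1);
    return actions[pick(gen)];
  }
  return *std::max_element(actions.begin(), actions.end(),
                           [&](A a, A b) { return q(s, a) < q(s, b); });
}

int main() {
  std::random_device rd;
  std::mt19937 gen(rd());

  auto q = [](S s, A a) { return static_cast<double>(s + a); }; // dummy Q-function
  std::vector<A> actions = {0, 1, 2, 3};

  double epsilon = 0.1; // can be decayed between episodes without rebuilding the policy
  A a = epsilon_greedy_action(q, epsilon, actions, /* s = */ 5, gen);
  std::cout << "chosen action: " << a << std::endl;
  return 0;
}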