diff --git a/examples/example-000-000-overview.cc b/examples/example-000-000-overview.cc
index 9bad0cc..a80a389 100644
--- a/examples/example-000-000-overview.cc
+++ b/examples/example-000-000-overview.cc
@@ -54,42 +54,42 @@ typedef Simulator::action_type A;
 // state transition. Let us use our own type to store the transition
 // elements.
 struct Transition {
-  S s;
-  A a;
-  Reward r;
-  S s_; // read s_ as s'
-  bool is_terminal;
+  S s;
+  A a;
+  Reward r;
+  S s_; // read s_ as s'
+  bool is_terminal;
 };
 
 std::string string_of_action(A a) {
-  std::string res;
-  switch(a) {
-  case rl::problem::cliff_walking::Action::actionNorth: res = "North"; break;
-  case rl::problem::cliff_walking::Action::actionSouth: res = "South"; break;
-  case rl::problem::cliff_walking::Action::actionEast: res = "East "; break;
-  case rl::problem::cliff_walking::Action::actionWest: res = "West "; break;
-  default: res = "?????";
-  }
-  return res;
+  std::string res;
+  switch(a) {
+  case rl::problem::cliff_walking::Action::actionNorth: res = "North"; break;
+  case rl::problem::cliff_walking::Action::actionSouth: res = "South"; break;
+  case rl::problem::cliff_walking::Action::actionEast: res = "East "; break;
+  case rl::problem::cliff_walking::Action::actionWest: res = "West "; break;
+  default: res = "?????";
+  }
+  return res;
 }
 
 // This prints a transition.
 std::ostream& operator<<(std::ostream& os, const Transition& t) {
-  os << std::setw(3) << t.s << ' ' << string_of_action(t.a)
-     << " ---" << std::setw(5) << t.r << " ---> ";
-  if(t.is_terminal)
-    os << "End-of-Episode";
-  else
-    os << std::setw(3) << t.s_;
-  return os;
+  os << std::setw(3) << t.s << ' ' << string_of_action(t.a)
+     << " ---" << std::setw(5) << t.r << " ---> ";
+  if(t.is_terminal)
+    os << "End-of-Episode";
+  else
+    os << std::setw(3) << t.s_;
+  return os;
 }
 
 // This functions makes a transition from its elements.
 Transition make_transition(S s, A a, Reward r, S s_) {
-  return {s,a,r,s_,false};
+  return {s,a,r,s_,false};
 }
 Transition make_terminal_transition(S s, A a, Reward r) {
-  return {s,a,r,s /* unused */,true};
+  return {s,a,r,s /* unused */,true};
 }
 
 // Let us define the parameters.
@@ -115,8 +115,8 @@ Transition make_terminal_transition(S s, A a, Reward r) {
 
 // This method simply retrives a q value from a gsl vector.
 double q_parametrized(const gsl_vector* theta,
-                      S s, A a) {
-  return gsl_vector_get(theta,TABULAR_Q_RANK(s,a));
+                      S s, A a) {
+  return gsl_vector_get(theta,TABULAR_Q_RANK(s,a));
 }
 
 // In the Q-Learning algorithm, updates are made according to the
@@ -125,9 +125,9 @@ double q_parametrized(const gsl_vector* theta,
 // gradient is straightforward, since it is a (00..00100..00) vector
 // with a 1 at the (s,a) rank position.
 void grad_q_parametrized(const gsl_vector* theta,
-                         gsl_vector* grad_theta_sa,
-                         S s, A a) {
-  gsl_vector_set_basis(grad_theta_sa,TABULAR_Q_RANK(s,a));
+                         gsl_vector* grad_theta_sa,
+                         S s, A a) {
+  gsl_vector_set_basis(grad_theta_sa,TABULAR_Q_RANK(s,a));
 }
 
 
@@ -137,107 +137,111 @@ using namespace std::placeholders;
 
 // Let us start some experiment
 int main(int argc, char* argv[]) {
-
-  // We need to provide iterators for enumerating all the state and action
-  // values. This can be done easily from an enumerators.
-  auto action_begin = rl::enumerator(rl::problem::cliff_walking::Action::actionNorth);
-  auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
-  auto state_begin = rl::enumerator(Cliff::start);
-  auto state_end = state_begin + Cliff::size;
-
-
-  // This is the dynamical system we want to control.
-  Param param;
-  Simulator simulator(param);
-
-  // Our Q-function is determined by some vector parameter. It is a
-  // gsl_vector since we use the GSL-based algorithm provided by the
-  // library.
-  gsl_vector* theta = gsl_vector_alloc(TABULAR_Q_CARDINALITY);
-  gsl_vector_set_zero(theta);
-
-  // If we need to use the Q-function parametrized by theta as q(s,a),
-  // we only have to bind our q_from_table function and get a
-  // functional object.
-  auto q = std::bind(q_parametrized,theta,_1,_2);
-
-  // Let us now define policies, related to q. The learning policy
-  // used is an epsilon-greedy one in the following, while we test the
-  // learned Q-function with a geedy policy.
-  auto learning_policy = rl::policy::epsilon_greedy(q,paramEPSILON,action_begin,action_end);
-  auto test_policy = rl::policy::greedy(q,action_begin,action_end);
-
-  // We intend to learn q on-line, by running episodes, and updating a
-  // critic fro the transition we get during the episodes. Let us use
-  // some GSL-based critic for that purpose.
-  auto critic = rl::gsl::sarsa(theta,
-                               paramGAMMA,paramALPHA,
-                               q_parametrized,
-                               grad_q_parametrized);
-
-  // We have now all the elements to start experiments.
-
-
-  // Let us run 10000 episodes with the agent that learns the Q-values.
-
-  std::cout << "Learning " << std::endl
-            << std::endl;
-
-  int episode;
-  for(episode = 0; episode < 10000; ++episode) {
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  // We need to provide iterators for enumerating all the state and action
+  // values. This can be done easily with enumerators.
+  auto action_begin = rl::enumerator(rl::problem::cliff_walking::Action::actionNorth);
+  auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
+  auto state_begin = rl::enumerator(Cliff::start);
+  auto state_end = state_begin + Cliff::size;
+
+
+  // This is the dynamical system we want to control.
+  Param param;
+  Simulator simulator(param);
+
+  // Our Q-function is determined by some vector parameter. It is a
+  // gsl_vector since we use the GSL-based algorithm provided by the
+  // library.
+  gsl_vector* theta = gsl_vector_alloc(TABULAR_Q_CARDINALITY);
+  gsl_vector_set_zero(theta);
+
+  // If we need to use the Q-function parametrized by theta as q(s,a),
+  // we only have to bind our q_from_table function and get a
+  // functional object.
+  auto q = std::bind(q_parametrized,theta,_1,_2);
+
+  // Let us now define policies, related to q. The learning policy
+  // used is an epsilon-greedy one in the following, while we test the
+  // learned Q-function with a greedy policy.
+  double epsilon = paramEPSILON;
+  auto learning_policy = rl::policy::epsilon_greedy(q,epsilon,action_begin,action_end, gen);
+  auto test_policy = rl::policy::greedy(q,action_begin,action_end);
+
+  // We intend to learn q on-line, by running episodes, and updating a
+  // critic from the transitions we get during the episodes. Let us use
+  // some GSL-based critic for that purpose.
+  auto critic = rl::gsl::sarsa(theta,
+                               paramGAMMA,paramALPHA,
+                               q_parametrized,
+                               grad_q_parametrized);
+
+  // We have now all the elements to start experiments.
+
+
+  // Let us run 10000 episodes with the agent that learns the Q-values.
+
+  std::cout << "Learning " << std::endl
+            << std::endl;
+
+  int episode;
+  for(episode = 0; episode < 10000; ++episode) {
+    simulator.restart();
+    auto actual_episode_length = rl::episode::learn(simulator,learning_policy,critic,
+                                                    0);
+    if(episode % 200 == 0)
+      std::cout << "episode " << std::setw(5) << episode+1
+                << " : length = " << std::setw(5) << actual_episode_length << std::endl;
+  }
+  std::cout << std::endl;
+
+  // Let us print the parameters. This can be dumped in a file, rather
+  // than printed, for saving the learned Q-value function.
+  std::cout << "Learned theta : " << std::endl
+            << std::endl
+            << theta << std::endl
+            << std::endl;
+
+
+  // Let us define v as v(s) = max_a q(s,a) with a lambda function.
+  auto v = [&action_begin,&action_end,&q](S s) -> double {return rl::max(std::bind(q,s,_1),
+                                                                         action_begin,
+                                                                         action_end);};
+  // We can draw the value function into an image file.
+  auto v_range = rl::range(v,state_begin,state_end);
+  std::cout << std::endl
+            << " V in [" << v_range.first << ',' << v_range.second << "]." << std::endl
+            << std::endl;
+  Cliff::draw("V-overview",0,v,v_range.first,v_range.second);
+  std::cout << "Image file \"V-overview-000000.ppm\" generated." << std::endl
+            << std::endl;
+
+  // Let us be greedy on the policy we have found, using the greedy
+  // agent to run an episode.
+  simulator.restart();
+  unsigned int nb_steps = rl::episode::run(simulator,test_policy,0);
+  std::cout << "Best policy episode ended after " << nb_steps << " steps." << std::endl;
+
+  // We can also gather the transitions from an episode into a collection.
+  std::vector<Transition> transition_set;
   simulator.restart();
-    auto actual_episode_length = rl::episode::learn(simulator,learning_policy,critic,
-                                                    0);
-    if(episode % 200 == 0)
-      std::cout << "episode " << std::setw(5) << episode+1
-                << " : length = " << std::setw(5) << actual_episode_length << std::endl;
-  }
-  std::cout << std::endl;
-
-  // Let us print the parameters. This can be dumped in a file, rather
-  // than printed, for saving the learned Q-value function.
-  std::cout << "Learned theta : " << std::endl
-            << std::endl
-            << theta << std::endl
-            << std::endl;
-
-
-  // Let us define v as v(s) = max_a q(s_a) with a labda function.
-  auto v = [&action_begin,&action_end,&q](S s) -> double {return rl::max(std::bind(q,s,_1),
-                                                                         action_begin,
-                                                                         action_end);};
-  // We can draw the Value function a image file.
-  auto v_range = rl::range(v,state_begin,state_end);
-  std::cout << std::endl
-            << " V in [" << v_range.first << ',' << v_range.second << "]." << std::endl
-            << std::endl;
-  Cliff::draw("V-overview",0,v,v_range.first,v_range.second);
-  std::cout << "Image file \"V-overview-000000.ppm\" generated." << std::endl
-            << std::endl;
-
-  // Let us be greedy on the policy we have found, using the greedy
-  // agent to run an episode.
-  simulator.restart();
-  unsigned int nb_steps = rl::episode::run(simulator,test_policy,0);
-  std::cout << "Best policy episode ended after " << nb_steps << " steps." << std::endl;
-
-  // We can also gather the transitions from an episode into a collection.
-  std::vector<Transition> transition_set;
-  simulator.restart();
-  nb_steps = rl::episode::run(simulator,test_policy,
-                              std::back_inserter(transition_set),
-                              make_transition,make_terminal_transition,
-                              0);
-  std::cout << std::endl
-            << "Collected transitions :" << std::endl
-            << "---------------------" << std::endl
-            << nb_steps << " == " << transition_set.size() << std::endl
-            << std::endl;
-  for(auto& t : transition_set)
-    std::cout << t << std::endl;
-
-
-  gsl_vector_free(theta);
-  return 0;
+  nb_steps = rl::episode::run(simulator,test_policy,
+                              std::back_inserter(transition_set),
+                              make_transition,make_terminal_transition,
+                              0);
+  std::cout << std::endl
+            << "Collected transitions :" << std::endl
+            << "---------------------" << std::endl
+            << nb_steps << " == " << transition_set.size() << std::endl
+            << std::endl;
+  for(auto& t : transition_set)
+    std::cout << t << std::endl;
+
+
+  gsl_vector_free(theta);
+  return 0;
 }
diff --git a/examples/example-003-002-pendulum-mlp-ktdq.cc b/examples/example-003-002-pendulum-mlp-ktdq.cc
index ceff2f9..c108f1b 100644
--- a/examples/example-003-002-pendulum-mlp-ktdq.cc
+++ b/examples/example-003-002-pendulum-mlp-ktdq.cc
@@ -109,7 +109,7 @@ int main(int argc, char* argv[]) {
   auto q = std::bind(q_parametrized,theta,_1,_2);
 
   rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
-  rl::enumerator a_end = a_begin+ rl::problem::inverted_pendulum::action_size;
+  rl::enumerator a_end = a_begin+ rl::problem::inverted_pendulum::actionSize;
 
   auto critic = rl::gsl::ktd_q(theta,
                                q_parametrized,
diff --git a/examples/example-003-003-mountain-car-ktdsarsa.cc b/examples/example-003-003-mountain-car-ktdsarsa.cc
index 4110131..72434eb 100644
--- a/examples/example-003-003-mountain-car-ktdsarsa.cc
+++ b/examples/example-003-003-mountain-car-ktdsarsa.cc
@@ -174,14 +174,15 @@ void train(int nb_episodes, bool make_movie, RANDOM_GENERATOR& gen) {
 
   auto q = std::bind(q_parametrized,theta,_1,_2);
 
-  // std::array actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
+  // std::array actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
   // auto a_begin = actions.begin();
   // auto a_end = actions.end();
 
   rl::enumerator a_begin(rl::problem::mountain_car::Action::actionNone); // This MUST be the lowest value of the enum type of actions and action enum values are consecutive for mountain_car
   rl::enumerator a_end = a_begin+rl::problem::mountain_car::actionSize;
 
-  auto explore_agent = rl::policy::epsilon_greedy(q,paramEPSILON,a_begin,a_end, gen);
+  double epsilon = paramEPSILON;
+  auto explore_agent = rl::policy::epsilon_greedy(q,epsilon,a_begin,a_end, gen);
   auto greedy_agent = rl::policy::greedy(q,a_begin,a_end);
 
   auto critic = rl::gsl::ktd_sarsa(theta,
diff --git a/examples/example-003-004-pendulum-onpolicy-LSPI.cc b/examples/example-003-004-pendulum-onpolicy-LSPI.cc
index 4e60986..eebc56d 100644
--- a/examples/example-003-004-pendulum-onpolicy-LSPI.cc
+++ b/examples/example-003-004-pendulum-onpolicy-LSPI.cc
@@ -25,10 +25,10 @@
 */
 
 /*
-  This example shows how to use a parametric representation of the
-  Q-function and apply recursive LSTD-Q to estimate the Q function. The inverted pendulum problem is solved here.
-  LSTD-Q gathers some transitions before updating the parameter vector. Then, the parameter vector is continuously updated.
-*/
+  This example shows how to use a parametric representation of the
+  Q-function and apply recursive LSTD-Q to estimate the Q function. The inverted pendulum problem is solved here.
+  LSTD-Q gathers some transitions before updating the parameter vector. Then, the parameter vector is continuously updated.
+ */
 
 #include
 #include
@@ -38,13 +38,13 @@
 #include
 #include
 #include
-#include
 #include
+#include
 
 using namespace std::placeholders;
 
 // This is our simulator.
-typedef rl::problem::inverted_pendulum::Simulator Simulator;
+using Simulator = rl::problem::inverted_pendulum::Simulator;
 
 // Definition of Reward, S, A, Transition and TransitionSet.
 #include "example-defs-transition.hpp"
@@ -69,53 +69,55 @@ typedef rl::problem::inverted_pendulum::Simulator
-  auto q_parametrized = [tmp](const gsl_vector* th,S s, A a) -> Reward {double res;
-    phi_rbf(tmp,s,a); // phi_sa = phi(s,a)
-    gsl_blas_ddot(th,tmp,&res); // res = th^T . phi_sa
-    return res;};
-
-
-  auto q = std::bind(q_parametrized,theta,_1,_2);
-
-  // We instantiate our LSTD-Q
-  //auto critic = rl::gsl::LSTDQ_Lambda(theta, paramGAMMA, paramREG, .4, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
-  auto critic = rl::gsl::LSTDQ(theta, paramGAMMA, paramREG, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
-
-  rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
-  rl::enumerator a_end = a_begin+3;
-  auto greedy_policy = rl::policy::greedy(q, a_begin,a_end);
-
-  try {
-
-    // Let us initialize the random seed.
-    rl::random::seed(getpid());
-
-    for(episode = 0 ; episode < NB_OF_EPISODES; ++episode) {
-      simulator.setPhase(Simulator::phase_type());
-      episode_length = rl::episode::learn(simulator,
-                                          greedy_policy,critic,
-                                          MAX_EPISODE_LENGTH);
-      //std::cout << "\r Episode " << episode << " : " << episode_length << std::flush;
-      // After each episode, we test our policy for NB_OF_TESTING_EPISODES
-      // episodes
-      double cumul_episode_length = 0.0;
-      for(unsigned int tepi = 0 ; tepi < NB_OF_TESTING_EPISODES; ++tepi) {
-        simulator.setPhase(Simulator::phase_type());
-        cumul_episode_length += rl::episode::run(simulator, greedy_policy, MAX_EPISODE_LENGTH);
-      }
-      std::cout << "\r Episode " << episode << " : mean length over " << NB_OF_TESTING_EPISODES << " episodes is " << cumul_episode_length/double(NB_OF_TESTING_EPISODES) << std::string(10, ' ') << std::flush;
-
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  int episode,episode_length;
+
+  Simulator simulator(gen);
+
+  gsl_vector* theta = gsl_vector_calloc(PHI_RBF_DIMENSION);
+  gsl_vector* tmp = gsl_vector_calloc(PHI_RBF_DIMENSION);
+
+  auto q_parametrized = [tmp](const gsl_vector* th,S s, A a) -> Reward {double res;
+    phi_rbf(tmp,s,a); // phi_sa = phi(s,a)
+    gsl_blas_ddot(th,tmp,&res); // res = th^T . phi_sa
+    return res;};
+
+
+  auto q = std::bind(q_parametrized,theta,_1,_2);
+
+  // We instantiate our LSTD-Q
+  //auto critic = rl::gsl::LSTDQ_Lambda(theta, paramGAMMA, paramREG, .4, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
+  auto critic = rl::gsl::LSTDQ(theta, paramGAMMA, paramREG, NB_OF_TRANSITIONS_WARMUP, phi_rbf);
+
+  rl::enumerator a_begin(rl::problem::inverted_pendulum::Action::actionNone);
+  rl::enumerator a_end = a_begin+rl::problem::inverted_pendulum::actionSize;
+  auto greedy_policy = rl::policy::greedy(q, a_begin,a_end);
+
+  Simulator::phase_type start_phase;
+  try {
+    for(episode = 0 ; episode < NB_OF_EPISODES; ++episode) {
+      start_phase.random(gen);
+      simulator.setPhase(start_phase);
+      episode_length = rl::episode::learn(simulator,
+                                          greedy_policy,critic,
+                                          MAX_EPISODE_LENGTH);
+      //std::cout << "\r Episode " << episode << " : " << episode_length << std::flush;
+      // After each episode, we test our policy for NB_OF_TESTING_EPISODES
+      // episodes
+      double cumul_episode_length = 0.0;
+      for(unsigned int tepi = 0 ; tepi < NB_OF_TESTING_EPISODES; ++tepi) {
+        start_phase.random(gen);
+        simulator.setPhase(start_phase);
+        cumul_episode_length += rl::episode::run(simulator, greedy_policy, MAX_EPISODE_LENGTH);
+      }
+      std::cout << "\r Episode " << episode << " : mean length over " << NB_OF_TESTING_EPISODES << " episodes is " << cumul_episode_length/double(NB_OF_TESTING_EPISODES) << std::string(10, ' ') << std::flush;
+
+    }
+  }
+  catch(rl::exception::Any& e) {
+    std::cerr << "Exception caught : " << e.what() << std::endl;
   }
-  }
-  catch(rl::exception::Any& e) {
-    std::cerr << "Exception caught : " << e.what() << std::endl;
-  }
-  std::cout << std::endl;
+  std::cout << std::endl;
 }
diff --git a/examples/example-004-001-cliff-onestep.cc b/examples/example-004-001-cliff-onestep.cc
index 088b450..45a5d00 100644
--- a/examples/example-004-001-cliff-onestep.cc
+++ b/examples/example-004-001-cliff-onestep.cc
@@ -3,6 +3,8 @@
 // Learner : One-step Actor-Critic
 
 #include
+#include
+
 
 #define NB_EPISODES 3000
@@ -19,14 +21,15 @@ using S = Simulator::observation_type;
 using A = Simulator::action_type;
 
 // The controller architecture
-using Architecture = rl::gsl::ActorCritic::Architecture::Tabular;
+using Architecture = rl::gsl::ActorCritic::Architecture::Tabular;
 
 // The algorithm to train the controller
 using Learner = rl::gsl::ActorCritic::Learner::OneStep;
 
 int main(int argc, char* argv[]) {
-  std::srand(time(0));
-
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
   // 1) Instantiate the simulator
   Param param;
   Simulator simulator(param);
@@ -37,7 +40,7 @@ int main(int argc, char* argv[]) {
   unsigned int nb_features = Cliff::size;
   Architecture archi(nb_features,
                      [](const S& s) { return s;},
-                     action_begin, action_end);
+                     action_begin, action_end, gen);
 
   // 3) Instantiate the learner
   Learner learner(archi, paramGAMMA, paramALPHA_V, paramALPHA_P);
diff --git a/examples/example-defs-cliff-experiments.hpp b/examples/example-defs-cliff-experiments.hpp
index c96c070..d43d198 100644
--- a/examples/example-defs-cliff-experiments.hpp
+++ b/examples/example-defs-cliff-experiments.hpp
@@ -141,7 +141,8 @@ void make_experiment(CRITIC& critic,
   auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
   auto state_begin = rl::enumerator(Cliff::start);
   auto state_end = state_begin + Cliff::size;
-  auto learning_policy = rl::policy::epsilon_greedy(q,paramEPSILON,
+  double epsilon = paramEPSILON;
+  auto learning_policy = rl::policy::epsilon_greedy(q,epsilon,
                                                     action_begin,action_end, gen);
   auto test_policy = rl::policy::greedy(q,action_begin,action_end);
   int episode,frame;
diff --git a/src/rl-inverted-pendulum.hpp b/src/rl-inverted-pendulum.hpp
index 002d0e2..ea94831 100644
--- a/src/rl-inverted-pendulum.hpp
+++ b/src/rl-inverted-pendulum.hpp
@@ -45,7 +45,7 @@ namespace rl {
        actionLeft = 1,
        actionRight= 2
       };
-      constexpr int action_size = 3;
+      constexpr int actionSize = 3;
 
       // some exceptions for state and action consistancy
       class BadAction : public rl::exception::Any {
diff --git a/src/rlActorCritic.hpp b/src/rlActorCritic.hpp
index 7d26eb7..8aff053 100644
--- a/src/rlActorCritic.hpp
+++ b/src/rlActorCritic.hpp
@@ -75,6 +75,7 @@ namespace rl {
       rl::enumerator _action_begin;
       rl::enumerator _action_end;
       std::function _q_function;
+      double temperature;
       std::function _policy;
 
       Actor(unsigned int nb_state_features,
@@ -86,7 +87,8 @@ namespace rl {
        _params(gsl_vector_alloc(nb_state_features*nb_actions)),
        _action_begin(action_begin), _action_end(action_end),
        _q_function(std::bind(&Actor::q_function, std::ref(*this), std::placeholders::_1, std::placeholders::_2)),
-       _policy(rl::policy::softmax(_q_function, 1.0, _action_begin, _action_end, gen)){
+       temperature(1.0),
+       _policy(rl::policy::softmax(_q_function, temperature, _action_begin, _action_end, gen)){
        gsl_vector_set_zero(_params);
       }
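
Usage sketch (illustration only, not part of the patch): the change repeated across these examples is to seed an explicit std::mt19937 generator instead of calling std::srand()/rl::random::seed(), and to hand rl::policy::epsilon_greedy a named double epsilon rather than the paramEPSILON constant directly, presumably so the policy can keep referring to a value the caller may still tune. The standalone snippet below reproduces that pattern with the standard library only; epsilon_greedy_action and its signature are hypothetical stand-ins, not the rl::policy::epsilon_greedy API.

// Standalone illustration of the epsilon-greedy-with-explicit-generator
// pattern adopted above. Everything here is plain standard C++; the function
// name and signature are made up for the example.
#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>

using S = int; // stand-in state type
using A = int; // stand-in action type

// Pick a uniformly random action with probability epsilon, the greedy one otherwise.
A epsilon_greedy_action(const std::function<double(S, A)>& q,
                        const double& epsilon,           // named lvalue, as in the patch
                        const std::vector<A>& actions,
                        S s,
                        std::mt19937& gen) {             // explicit generator, as in the patch
  std::bernoulli_distribution explore(epsilon);
  if(explore(gen)) {
    std::uniform_int_distribution<std::size_t> pick(0, actions.size() - 1);
    return actions[pick(gen)];
  }
  return *std::max_element(actions.begin(), actions.end(),
                           [&](A a, A b) { return q(s, a) < q(s, b); });
}

int main() {
  std::random_device rd;
  std::mt19937 gen(rd());

  auto q = [](S s, A a) { return static_cast<double>(s + a); }; // dummy Q-function
  std::vector<A> actions = {0, 1, 2, 3};

  double epsilon = 0.1; // can be decayed between episodes without rebuilding the policy
  A a = epsilon_greedy_action(q, epsilon, actions, /* s = */ 5, gen);
  std::cout << "chosen action: " << a << std::endl;
  return 0;
}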