
Commit

should be done
jeremyfix committed Sep 29, 2018
1 parent 026885a commit 71c178a
Showing 8 changed files with 206 additions and 193 deletions.
262 changes: 133 additions & 129 deletions examples/example-000-000-overview.cc
@@ -54,42 +54,42 @@ typedef Simulator::action_type A;
// state transition. Let us use our own type to store the transition
// elements.
struct Transition {
    S s;
    A a;
    Reward r;
    S s_; // read s_ as s'
    bool is_terminal;
};

std::string string_of_action(A a) {
    std::string res;
    switch(a) {
    case rl::problem::cliff_walking::Action::actionNorth: res = "North"; break;
    case rl::problem::cliff_walking::Action::actionSouth: res = "South"; break;
    case rl::problem::cliff_walking::Action::actionEast: res = "East "; break;
    case rl::problem::cliff_walking::Action::actionWest: res = "West "; break;
    default: res = "?????";
    }
    return res;
}

// This prints a transition.
std::ostream& operator<<(std::ostream& os, const Transition& t) {
    os << std::setw(3) << t.s << ' ' << string_of_action(t.a)
       << " ---" << std::setw(5) << t.r << " ---> ";
    if(t.is_terminal)
        os << "End-of-Episode";
    else
        os << std::setw(3) << t.s_;
    return os;
}

// This function makes a transition from its elements.
Transition make_transition(S s, A a, Reward r, S s_) {
    return {s,a,r,s_,false};
}
Transition make_terminal_transition(S s, A a, Reward r) {
    return {s,a,r,s /* unused */,true};
}
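For illustration only (none of the following is part of this commit), the helpers above combine with the operator<< like this; Cliff::start is used as a placeholder state value, as it is in main() further down, and -1 is an arbitrary reward:

// Illustration (not from this diff): build and print two transitions.
void print_sample_transitions() {
    Transition t = make_transition(Cliff::start,
                                   rl::problem::cliff_walking::Action::actionNorth,
                                   -1, Cliff::start);
    std::cout << t << std::endl;
    std::cout << make_terminal_transition(Cliff::start,
                                          rl::problem::cliff_walking::Action::actionEast,
                                          -1)
              << std::endl;
}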

// Let us define the parameters.
@@ -115,8 +115,8 @@ Transition make_terminal_transition(S s, A a, Reward r) {

// This function simply retrieves a Q-value from a gsl vector.
double q_parametrized(const gsl_vector* theta,
                      S s, A a) {
    return gsl_vector_get(theta,TABULAR_Q_RANK(s,a));
}

// In the Q-Learning algorithm, updates are made according to the
@@ -125,9 +125,9 @@ double q_parametrized(const gsl_vector* theta,
// gradient is straightforward, since it is a (00..00100..00) vector
// with a 1 at the (s,a) rank position.
void grad_q_parametrized(const gsl_vector* theta,
                         gsl_vector* grad_theta_sa,
                         S s, A a) {
    gsl_vector_set_basis(grad_theta_sa,TABULAR_Q_RANK(s,a));
}
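For intuition (a sketch, not code from this commit): since the gradient is a basis vector, the TD update theta <- theta + alpha * delta * grad_theta q(s,a) only moves the single entry indexed by TABULAR_Q_RANK(s,a). With a SARSA-style temporal difference delta = r + gamma*q(s',a') - q(s,a), one such update could be written as:

// Sketch only (not part of this commit): one SARSA-style update under the
// tabular parametrization above. Equivalent to theta += alpha*delta*grad
// with grad obtained from grad_q_parametrized, i.e. a one-hot vector.
void sketch_td_update(gsl_vector* theta,
                      S s, A a, Reward r, S s_, A a_,
                      double gamma, double alpha) {
    double delta = r + gamma * q_parametrized(theta, s_, a_)
                     - q_parametrized(theta, s, a);
    std::size_t rank = TABULAR_Q_RANK(s, a);
    gsl_vector_set(theta, rank, gsl_vector_get(theta, rank) + alpha * delta);
}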


@@ -137,107 +137,111 @@ using namespace std::placeholders;

// Let us start some experiment
int main(int argc, char* argv[]) {

+   std::random_device rd;
+   std::mt19937 gen(rd());

    // We need to provide iterators for enumerating all the state and action
    // values. This can be done easily from enumerators.
    auto action_begin = rl::enumerator<A>(rl::problem::cliff_walking::Action::actionNorth);
    auto action_end = action_begin + rl::problem::cliff_walking::actionSize;
    auto state_begin = rl::enumerator<S>(Cliff::start);
    auto state_end = state_begin + Cliff::size;

    // This is the dynamical system we want to control.
    Param param;
    Simulator simulator(param);

    // Our Q-function is determined by some vector parameter. It is a
    // gsl_vector since we use the GSL-based algorithm provided by the
    // library.
    gsl_vector* theta = gsl_vector_alloc(TABULAR_Q_CARDINALITY);
    gsl_vector_set_zero(theta);

    // If we need to use the Q-function parametrized by theta as q(s,a),
    // we only have to bind our q_parametrized function and get a
    // functional object.
    auto q = std::bind(q_parametrized,theta,_1,_2);

    // Let us now define policies related to q. The learning policy
    // used is an epsilon-greedy one in the following, while we test the
    // learned Q-function with a greedy policy.
-   auto learning_policy = rl::policy::epsilon_greedy(q,paramEPSILON,action_begin,action_end);
+   double epsilon = paramEPSILON;
+   auto learning_policy = rl::policy::epsilon_greedy(q,epsilon,action_begin,action_end, gen);
    auto test_policy = rl::policy::greedy(q,action_begin,action_end);

    // We intend to learn q on-line, by running episodes, and updating a
    // critic from the transitions we get during the episodes. Let us use
    // some GSL-based critic for that purpose.
    auto critic = rl::gsl::sarsa<S,A>(theta,
                                      paramGAMMA,paramALPHA,
                                      q_parametrized,
                                      grad_q_parametrized);

    // We have now all the elements to start experiments.

    // Let us run 10000 episodes with the agent that learns the Q-values.
    std::cout << "Learning " << std::endl
              << std::endl;

    int episode;
    for(episode = 0; episode < 10000; ++episode) {
        simulator.restart();
        auto actual_episode_length = rl::episode::learn(simulator,learning_policy,critic,
                                                        0);
        if(episode % 200 == 0)
            std::cout << "episode " << std::setw(5) << episode+1
                      << " : length = " << std::setw(5) << actual_episode_length << std::endl;
    }
    std::cout << std::endl;

    // Let us print the parameters. This can be dumped in a file, rather
    // than printed, for saving the learned Q-value function.
    std::cout << "Learned theta : " << std::endl
              << std::endl
              << theta << std::endl
              << std::endl;

    // Let us define v as v(s) = max_a q(s,a) with a lambda function.
    auto v = [&action_begin,&action_end,&q](S s) -> double {return rl::max(std::bind(q,s,_1),
                                                                           action_begin,
                                                                           action_end);};
    // We can draw the value function into an image file.
    auto v_range = rl::range(v,state_begin,state_end);
    std::cout << std::endl
              << " V in [" << v_range.first << ',' << v_range.second << "]." << std::endl
              << std::endl;
    Cliff::draw("V-overview",0,v,v_range.first,v_range.second);
    std::cout << "Image file \"V-overview-000000.ppm\" generated." << std::endl
              << std::endl;

    // Let us be greedy on the policy we have found, using the greedy
    // agent to run an episode.
    simulator.restart();
    unsigned int nb_steps = rl::episode::run(simulator,test_policy,0);
    std::cout << "Best policy episode ended after " << nb_steps << " steps." << std::endl;

    // We can also gather the transitions from an episode into a collection.
    std::vector<Transition> transition_set;
simulator.restart();
    nb_steps = rl::episode::run(simulator,test_policy,
                                std::back_inserter(transition_set),
                                make_transition,make_terminal_transition,
                                0);
    std::cout << std::endl
              << "Collected transitions :" << std::endl
              << "---------------------" << std::endl
              << nb_steps << " == " << transition_set.size() << std::endl
              << std::endl;
    for(auto& t : transition_set)
        std::cout << t << std::endl;

    gsl_vector_free(theta);
    return 0;
}
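The substantive change in this example, aside from re-indentation, is that rl::policy::epsilon_greedy now receives a random generator (the std::mt19937 gen created at the top of main) and an epsilon lvalue rather than the paramEPSILON constant directly, presumably because the policy draws from the generator and holds references to its arguments. For intuition, this is roughly what an epsilon-greedy choice does with those ingredients; a hand-written sketch, not rl-lib's implementation, and the helper name is made up:

// Sketch only: a hand-rolled epsilon-greedy action choice (not rl-lib code).
// Requires <random> and <vector>; AIter is any forward iterator over actions,
// e.g. rl::enumerator<A> as used above.
template<typename Q, typename AIter, typename Gen>
A epsilon_greedy_choice(const Q& q, double epsilon,
                        AIter a_begin, AIter a_end, Gen& gen, S s) {
    std::bernoulli_distribution explore(epsilon);
    if(explore(gen)) {
        // Explore: pick one of the available actions uniformly at random.
        std::vector<A> actions;
        for(auto it = a_begin; it != a_end; ++it)
            actions.push_back(*it);
        std::uniform_int_distribution<std::size_t> pick(0, actions.size() - 1);
        return actions[pick(gen)];
    }
    // Exploit: return an action maximizing q(s,a).
    A best = *a_begin;
    double best_q = q(s, best);
    for(auto it = a_begin; it != a_end; ++it)
        if(q(s, *it) > best_q) { best_q = q(s, *it); best = *it; }
    return best;
}

In the example above, these same ingredients (q, epsilon, the action range and gen) are simply handed to rl::policy::epsilon_greedy.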

2 changes: 1 addition & 1 deletion examples/example-003-002-pendulum-mlp-ktdq.cc
@@ -109,7 +109,7 @@ int main(int argc, char* argv[]) {
auto q = std::bind(q_parametrized,theta,_1,_2);

rl::enumerator<A> a_begin(rl::problem::inverted_pendulum::Action::actionNone);
- rl::enumerator<A> a_end = a_begin+ rl::problem::inverted_pendulum::action_size;
+ rl::enumerator<A> a_end = a_begin+ rl::problem::inverted_pendulum::actionSize;

auto critic = rl::gsl::ktd_q<S,A>(theta,
q_parametrized,
5 changes: 3 additions & 2 deletions examples/example-003-003-mountain-car-ktdsarsa.cc
@@ -174,14 +174,15 @@ void train(int nb_episodes, bool make_movie, RANDOM_GENERATOR& gen) {
auto q = std::bind(q_parametrized,theta,_1,_2);


- // std::array<A, 3> actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
+ // std::array<A, rl::problem::mountain_car::actionSize> actions = {rl::problem::mountain_car::Action::actionBackward, rl::problem::mountain_car::Action::actionNone, rl::problem::mountain_car::Action::actionForward};
// auto a_begin = actions.begin();
// auto a_end = actions.end();

rl::enumerator<A> a_begin(rl::problem::mountain_car::Action::actionNone); // This MUST be the lowest value of the action enum type, and the action enum values are consecutive for mountain_car
rl::enumerator<A> a_end = a_begin+rl::problem::mountain_car::actionSize;

- auto explore_agent = rl::policy::epsilon_greedy(q,paramEPSILON,a_begin,a_end, gen);
+ double epsilon = paramEPSILON;
+ auto explore_agent = rl::policy::epsilon_greedy(q,epsilon,a_begin,a_end, gen);
auto greedy_agent = rl::policy::greedy(q,a_begin,a_end);

auto critic = rl::gsl::ktd_sarsa<S,A>(theta,
