catch all for gathering run_stats more reliably.
hiive committed Nov 23, 2019
1 parent 89c9573 commit aacab34
Showing 1 changed file with 33 additions and 30 deletions.
63 changes: 33 additions & 30 deletions hiive/mdptoolbox/mdp.py
@@ -64,11 +64,11 @@
import hiive.mdptoolbox.util as _util

_MSG_STOP_MAX_ITER = "Iterating stopped due to maximum number of iterations " \
-"condition."
+"condition."
_MSG_STOP_EPSILON_OPTIMAL_POLICY = "Iterating stopped, epsilon-optimal " \
-"policy found."
+"policy found."
_MSG_STOP_EPSILON_OPTIMAL_VALUE = "Iterating stopped, epsilon-optimal value " \
-"function found."
+"function found."
_MSG_STOP_UNCHANGING_POLICY = "Iterating stopped, unchanging policy found."


@@ -94,7 +94,6 @@ def _printVerbosity(iteration, variation):


class MDP:
-
"""A Markov Decision Problem.
Let ``S`` = the number of states, and ``A`` = the number of actions.
@@ -234,7 +233,7 @@ def __repr__(self):
for aa in range(self.A):
P_repr += repr(self.P[aa]) + "\n"
R_repr += repr(self.R[aa]) + "\n"
-return(P_repr + "\n" + R_repr)
+return (P_repr + "\n" + R_repr)

def _bellmanOperator(self, V=None):
# Apply the Bellman operator on the value function.
@@ -252,7 +251,7 @@ def _bellmanOperator(self, V=None):
# make sure the user supplied V is of the right shape
try:
assert V.shape in ((self.S,), (1, self.S)), "V is not the " \
-"right shape (Bellman operator)."
+"right shape (Bellman operator)."
except AttributeError:
raise TypeError("V must be a numpy array or matrix.")
# Looping through each action, the Q-value matrix is calculated.
@@ -357,7 +356,6 @@ def setVerbose(self):


class FiniteHorizon(MDP):
-
"""An MDP solved using the finite-horizon backwards induction algorithm.
Parameters
@@ -455,7 +453,6 @@ def run(self):


class _LP(MDP):
-
"""A discounted MDP solved using linear programming.
This class requires the Python ``cvxopt`` module to be installed.
@@ -556,7 +553,6 @@ def run(self):


class PolicyIteration(MDP):
-
"""A discounted MDP solved using the policy iteration algorithm.
Arguments
@@ -638,7 +634,7 @@ def __init__(self, transitions, reward, gamma, policy0=None,
# Make sure it is a numpy array
policy0 = _np.array(policy0)
# Make sure the policy is the right size and shape
-assert policy0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
+assert policy0.shape in ((self.S,), (self.S, 1), (1, self.S)), \
"'policy0' must be a vector with length S."
# reshape the policy to be a vector
policy0 = policy0.reshape(self.S)
@@ -742,7 +738,7 @@ def _evalPolicyIterative(self, V0=0, epsilon=0.0001, max_iter=10000):
# number of iterations reached.
#
try:
-assert V0.shape in ((self.S, ), (self.S, 1), (1, self.S)), \
+assert V0.shape in ((self.S,), (self.S, 1), (1, self.S)), \
"'V0' must be a vector of length S."
policy_V = _np.array(V0).reshape(self.S)
except AttributeError:
@@ -835,7 +831,7 @@ def run(self):
v_cumulative = []

self.p_cumulative = []
-
+run_stats = []
while True:
self.iter += 1
take_run_stat = self.iter % self.run_stat_frequency == 0 or self.iter == self.max_iter
@@ -857,13 +853,16 @@
# value alone
policy_next, next_v = self._bellmanOperator()
error = _np.absolute(next_v - policy_V).max()
+run_stats.append(self._build_run_stat(i=self.iter, s=None, a=None, r=_np.max(policy_V),
+p=policy_next, v=policy_V, error=error))

if take_run_stat:
error_cumulative.append(error)
if len(error_cumulative) == 100:
self.error_mean.append(_np.mean(error_cumulative))
error_cumulative = []
-self.run_stats.append(self._build_run_stat(i=self.iter, s=None, a=None, r=_np.max(policy_V),
-p=policy_next, v=policy_V, error=error))
+self.run_stats.append(run_stats[-1])
+run_stats = []
del next_v
# calculate in how many places does the old policy disagree with
# the new policy
@@ -892,12 +891,12 @@ def run(self):
self.v_mean.append(_np.mean(v_cumulative, axis=1))
if len(error_cumulative) > 0:
self.error_mean.append(_np.mean(error_cumulative))
-
+if self.run_stats is None or len(self.run_stats) == 0:
+self.run_stats = run_stats
return self.run_stats
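
The bookkeeping pattern above repeats in every solver's run() loop: build a stat record on every iteration into a local buffer, promote only the newest record to self.run_stats on a sampling tick, and fall back to the buffer when the loop exits before any tick fires. A minimal sketch of the idea in plain Python; build_stat, frequency, and converged are hypothetical stand-ins for the solver internals:

def gather_run_stats(max_iter, frequency, build_stat, converged):
    # Sketch of the buffered run-stats pattern this commit applies.
    run_stats = []   # local buffer, appended to on every iteration
    all_stats = []   # promoted records (plays the role of self.run_stats)
    for i in range(1, max_iter + 1):
        stat = build_stat(i)
        run_stats.append(stat)                   # always record the stat
        if i % frequency == 0 or i == max_iter:  # sampling tick
            all_stats.append(run_stats[-1])      # promote the newest record only
            run_stats = []                       # reset the buffer
        if converged(stat):                      # early exit, possibly before any tick
            break
    # "catch all": a run that never reached a tick still returns its buffered stats
    if not all_stats:
        all_stats = run_stats
    return all_stats

For a run that converges after three iterations with frequency=100, the old code returned an empty list; with the fallback, the three buffered records come back instead.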


class PolicyIterationModified(PolicyIteration):
-
"""A discounted MDP solved using a modified policy iteration algorithm.
Arguments
@@ -1010,7 +1009,6 @@ def run(self):


class QLearning(MDP):
-
"""A discounted MDP solved using the Q learning algorithm.
Parameters
@@ -1142,6 +1140,7 @@ def run(self):
# initial state choice
s = _np.random.randint(0, self.S)
reset_s = False
+run_stats = []
for n in range(1, self.max_iter + 1):

take_run_stat = n % self.run_stat_frequency == 0 or n == self.max_iter
@@ -1189,6 +1188,8 @@ def run(self):
p = self.Q.argmax(axis=1)
self.policy = p

+run_stats.append(self._build_run_stat(i=n, s=s, a=a, r=r, p=p, v=v, error=error))

if take_run_stat:
error_cumulative.append(error)

@@ -1207,11 +1208,12 @@ def run(self):
"""
Rewards, errors, and time at each iteration, I think.
But that's for all of them and steps per episode?
Alpha decay and min?
And alpha and epsilon at each iteration?
"""
-self.run_stats.append(self._build_run_stat(i=n, s=s, a=a, r=r, p=p, v=v, error=error))
+self.run_stats.append(run_stats[-1])
+run_stats = []

if self.iter_callback is not None:
reset_s = self.iter_callback(s, a, s_new)
@@ -1233,7 +1235,8 @@ def run(self):
self.v_mean.append(_np.mean(v_cumulative, axis=1))
if len(error_cumulative) > 0:
self.error_mean.append(_np.mean(error_cumulative))
-
+if self.run_stats is None or len(self.run_stats) == 0:
+self.run_stats = run_stats
return self.run_stats
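
With this change QLearning.run() returns a non-empty list of per-iteration stat dicts even if the loop ends before a sampling tick fires. A usage sketch; the forest() example MDP and the 'Error'/'Reward' key names are assumptions about the surrounding package, not shown in this diff:

import hiive.mdptoolbox.example as example
from hiive.mdptoolbox.mdp import QLearning

P, R = example.forest()      # small bundled test MDP (assumed available)
ql = QLearning(P, R, 0.96)   # transitions, rewards, gamma
stats = ql.run()             # list of dicts, one per recorded iteration

last = stats[-1]             # key names assumed from _build_run_stat
print(last.get('Error'), last.get('Reward'))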

def _build_run_stat(self, i, a, error, p, r, s, v):
@@ -1255,7 +1258,6 @@ def _build_run_stat(self, i, a, error, p, r, s, v):


class RelativeValueIteration(MDP):
-
"""An MDP solved using the relative value iteration algorithm.
Arguments
@@ -1327,7 +1329,7 @@ def __init__(self, transitions, reward, epsilon=0.01, max_iter=1000,
skip_check=False):
# Initialise a relative value iteration MDP.

-MDP.__init__(self, transitions, reward, None, epsilon, max_iter,
+MDP.__init__(self, transitions, reward, None, epsilon, max_iter,
skip_check=skip_check)

self.epsilon = epsilon
@@ -1373,7 +1375,6 @@ def run(self):


class ValueIteration(MDP):
-
"""A discounted MDP solved using the value iteration algorithm.
Description
@@ -1502,7 +1503,7 @@ def __init__(self, transitions, reward, gamma, epsilon=0.01,
self.V = _np.zeros(self.S)
else:
assert len(initial_value) == self.S, "The initial value must be " \
-"a vector of length S."
+"a vector of length S."
self.V = _np.array(initial_value).reshape(self.S)
if self.gamma < 1:
# compute a bound for the number of iterations and update the
Expand Down Expand Up @@ -1574,7 +1575,7 @@ def run(self):
self.v_mean = []
self.error_mean = []
self.p_cumulative = []
-
+run_stats = []
while True:
self.iter += 1
take_run_stat = self.iter % self.run_stat_frequency == 0 or self.iter == self.max_iter
@@ -1584,11 +1585,12 @@
# Bellman Operator: compute policy and value functions
self.policy, self.V = self._bellmanOperator()

-
# The values, based on Q. For the function "max()": the option
# "axis" means the axis along which to operate. In this case it
# finds the maximum of the rows. (Operates along the columns?)
error = _util.getSpan(self.V - Vprev)
+run_stats.append(self._build_run_stat(i=self.iter, s=None, a=None, r=_np.max(self.V),
+p=self.policy, v=self.V, error=error))
if take_run_stat:
error_cumulative.append(error)
if len(self.p_cumulative) == 0 or not _np.array_equal(self.policy, self.p_cumulative[-1][1]):
@@ -1600,8 +1602,8 @@ def run(self):
self.error_mean.append(_np.mean(error_cumulative))
error_cumulative = []

-self.run_stats.append(self._build_run_stat(i=self.iter, s=None, a=None, r=_np.max(self.V),
-p=self.policy, v=self.V, error=error))
+self.run_stats.append(run_stats[-1])
+run_stats = []

if self.verbose:
_printVerbosity(self.iter, error)
@@ -1617,11 +1619,13 @@

self._endRun()

+# catch stragglers
if len(v_cumulative) > 0:
self.v_mean.append(_np.mean(v_cumulative, axis=1))
if len(error_cumulative) > 0:
self.error_mean.append(_np.mean(error_cumulative))
-
+if self.run_stats is None or len(self.run_stats) == 0:
+self.run_stats = run_stats
return self.run_stats
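
The fallback matters most in this class: ValueIteration can break on the span test after only a few sweeps, before the first sampling tick, in which case run() previously came back empty. A sketch of that scenario; forest() is an assumption about the surrounding package:

import hiive.mdptoolbox.example as example
from hiive.mdptoolbox.mdp import ValueIteration

P, R = example.forest()
vi = ValueIteration(P, R, 0.96)   # converges in a handful of sweeps on this tiny MDP
stats = vi.run()

# An early convergence used to leave self.run_stats empty; the catch-all
# now returns the locally buffered records instead.
assert len(stats) > 0
print(len(stats), "records")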

def _build_run_stat(self, i, s, a, r, p, v, error):
@@ -1643,7 +1647,6 @@ def _build_run_stat(self, i, s, a, r, p, v, error):


class ValueIterationGS(ValueIteration):
-
"""
A discounted MDP solved using the value iteration Gauss-Seidel algorithm.