"""
This script implements control with faults on a fuel tank system
on a cargo plane represented by a custom simulator. There are 6 tanks. 4 of
the tanks are primary tanks and have outputs to engines. The remaining 2 are
auxiliary tanks which feed into the primary tanks. The system drains outer tanks
first before using inner tanks to feed engines. Faults in the system are leak(s)
in fuel tanks. Only a single fault occurs at a time.
For more information, see models/fuel_tanks.py
Two kinds of control are implemented:
- Reinforcement learning: The controller explores a sample of the state space
to estimate values for different actions. Actions with the largest
values are picked at each step. Function approximation is used for
value estimation.
- Model predictive control: The controller samples each of the possible
reachable states within a timestep horizon and picks a state closest
to the goal.
It is assumed that the controller has an accurate model of the system.
Fuel tanks are arranged physically (and indexed) as:
1 2 LAux | RAux 3 4
[1 2 3 4 5 6]
Usage:
> python tanks.py --help # view arguments help
> python tanks.py -x # simply simulate the tanks
> python tanks.py -x -f 3 # simulate tanks with fault in third tank (LAux)
> python .\tankscustomdemo.py -c 2e-4 -f 6 -r 0.2 -s 5 -m 10 -e 0.75
> python .\tankscustomdemo.py --usempc -m 1
Default model and learning parameters can be changed below. Some of them
can be tuned from the command-line.
Requires:
flask,
numpy,
scipy
"""
import math
import random
import flask
import numpy as np
from scipy.integrate import trapz
from argparse import ArgumentParser, RawTextHelpFormatter
from qlearn import SLearner
from qlearn import FlagGenerator
from models import SixTankModel
# Default configuration parameters
COVERAGE = 1e-4 # Fraction of states to cover in learning initially
LRATE = 1e-1 # Learning rate (0, 1]
DISCOUNT = 0.75 # Discount factor (0, 1]
EXPLORATION = 0 # Exploration while recommending actions [0, 1]
POLICY = SLearner.SOFTMAX # The action selection policy
DEPTH = 10 # Number of steps at most in each learning episode
STEPS = 1 # Number of steps to look ahead during learning
DENSITY = 1.0 # Fraction of neighbouring states sampled for episodes when exploring
SEED = None # Random number seed
FAULT = list(range(7)) # Default set of faults
DELTA_T = 1 # step size of simulation
FUNCDIM = 7 # dimensions in value function
# Set up command-line configuration
args = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
args.add_argument('-i', '--initial', metavar=tuple(['L']*6 + ['A']*6), type=float,
nargs=12, help="Initial tank levels (0-100) and switch values (0/1).",
default=[100, 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0])
args.add_argument('-c', '--coverage', metavar='C', type=float,
help="Fraction of states to cover in learning", default=COVERAGE)
args.add_argument('-r', '--rate', metavar='R', type=float,
help="Learning rate (0, 1]", default=LRATE)
args.add_argument('-d', '--discount', metavar='D', type=float,
help="Discount factor (0, 1]", default=DISCOUNT)
args.add_argument('-e', '--explore', metavar='E', type=float,
help="Exploration while recommending actions [0, 1]", default=EXPLORATION)
args.add_argument('-s', '--steps', metavar='S', type=int,
help="Number of steps to look ahead during learning", default=STEPS)
args.add_argument('-m', '--maxdepth', metavar='M', type=int,
help="Number of steps at most in each learning episode", default=DEPTH)
args.add_argument('-f', '--fault', metavar='F', type=int, nargs='+',
help="List of faults. For server, first fault is chosen.", default=FAULT)
args.add_argument('-p', '--policy', metavar='P', choices=['uniform', 'softmax', 'greedy'],
help="The action selection policy", default=POLICY)
args.add_argument('--seed', metavar='SEED', type=int,
help="Random number seed", default=SEED)
args.add_argument('-x', '--disable', action='store_true',
help="Learning disabled if included", default=False)
args.add_argument('--usempc', action='store_true',
help="Use model predictive controller.", default=False)
args.add_argument('--hierarchical', action='store_true',
help="Hierarchical state space traversal.", default=False)
args.add_argument('--density', type=float,
help="State sampling density (0, 1]. 1 => all neighbours sampled.", default=DENSITY)
args.add_argument('--numtrials', type=int, metavar='N',
help="Run trials instead of interactive server.", default=None)
args.add_argument('--noise', type=float, metavar='N',
help="Amount of noise in model behaviour.", default=0.0)
args.add_argument('--verbose', action='store_true',
help="Print parameters used.", default=False)
ARGS = args.parse_args()
class ModelPredictiveController(SLearner):
"""
Creates a subclass of SLearner that uses Model Predictive
Control to recommend actions. MPC does not learn a value function but
instead does a receding horizon look-ahead at each timestep while
choosing the action optimizing some static utility function.
Args:
dmap (func): A function that takes the state vector and returns a number
representing the "distance" from ideal state.
simulator: An object with a run() function that takes state and action
vectors and an optional stepsize argument. Returns the next state
vector.
        stateconverter/actionconverter (FlagGenerator): Encode/decode vectors
            to and from an integer representation (mostly for compatibility
            with SLearner).
depth (int): Maximum horizon to look ahead.
density (float): The fraction of neighbouring states to sample.
seed (int): Random number generator seed. Otherwise random.
"""
def __init__(self, dmap, simulator, stateconverter, actionconverter, depth=1,
density=1, seed=None):
self.random = np.random.RandomState() if seed is None else np.random.RandomState(seed)
self.dmap = dmap # cost measure to minimize
self.depth = depth
self.density = density
self.simulator = simulator
self.stateconverter = stateconverter
self.actionconverter = actionconverter
self.funcdim = 1 # for compatibility
self._avecs = [avec for avec in self.actionconverter]
self.weights = np.ones((1, 13)) # just for compatibility
def learn(self, *args, **kwargs):
"""
An MPC has no learning phase.
"""
return [[]], [[]]
def recommend(self, state, **kwargs):
"""
Implements the receding horizon online supervision algorithm by
Abdelwahed et al.
"""
min_dist = np.inf
optimal = None
tree = [(None, state, 0, None)] # (parent ref, state, depth, action)
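        # Breadth-first expansion: new nodes are inserted at the front of the
        # list and popped from the back (FIFO), so states are visited in order
        # of increasing depth while tracking the state closest to the goal.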
while len(tree):
cnode = tree.pop()
if cnode[2] == self.depth+1:
break
# add eligible states to be explored to tree
            neighbours = self.neighbours(cnode[1])
            # Shuffle action indices (not the states) so each node records the
            # action that produced its state, then sample a `density` fraction.
            order = list(range(len(neighbours)))
            self.random.shuffle(order)
            for action in order[:int(np.ceil(len(neighbours) * self.density))]:
                nstate = neighbours[action]
                node = (cnode, nstate, cnode[2]+1, action)
                tree.insert(0, node)
# check state eligibility
if self.dmap(nstate) < min_dist:
min_dist = self.dmap(nstate)
optimal = node
# Trace back to first action
if optimal is None: # i.e. starting state is closest state, maintain action
return state[6:]
while optimal[0] is not None:
action = optimal[3]
optimal = optimal[0]
return self.actionconverter.decode(action)
def moment(s):
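    """
    Absolute fuel moment about the plane's centerline: level differences of
    the tank pairs (1, 4), (2, 3) and (LAux, RAux) are weighted 3, 2 and 1.
    """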
return abs(3 * (s[0] - s[5]) + \
2 * (s[1] - s[4]) + \
1 * (s[2] - s[3]))
def goal(state):
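    """Goal test: at most 5 units of fuel remain across all six tanks."""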
return sum(state[:6]) <= 5
def hierarchy(state):
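    """Adaptive step size: DELTA_T plus the order of magnitude of the current imbalance."""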
return DELTA_T + int(np.log10(1 + moment(state)))
def reward(state, action, nstate, **kwargs):
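    """
    Reward increases with the fuel remaining (as a fraction of the 600-unit
    total capacity) and decreases with the imbalance of the next state.
    """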
# return(1000.0 / ((abs(state[0] - state[5])) + (abs(state[1] - state[4])) + (abs(state[2] - state[3])) + 1))
return (sum(nstate[:6]) / 600) + (1 / (1 + moment(nstate)))
def dfunc(state, action, weights):
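    """
    Feature vector of the linear value function (also its gradient with
    respect to the weights): six scaled level-action products plus a bias.
    """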
# return np.concatenate((state[:6], action, [1])) / np.array([100, 100, 100, 100, 100, 100, 1, 1, 1, 1, 1, 1, 1])
return np.array([state[i] * (action[i] + 1) / 200 for i in range(6)] + [1])
def func(state, action, weights):
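    """Linear value estimate: dot product of the feature vector and the weights."""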
return np.dot(dfunc(state, action, weights), weights)
# The sampling grid over the state space. A total of 1,000,000 states.
STATES = FlagGenerator((20, 5, 100), (20, 5, 100), (20, 5, 100), (20, 5, 100),
(20, 5, 100), (20, 5, 100), 2, 2, 2, 2, 2, 2)
# The possible set of actions (64).
ACTIONS = FlagGenerator(2, 2, 2, 2, 2, 2)
# The system with a possible fault
SIM = SixTankModel(fault=ARGS.fault[0], noise=ARGS.noise, seed=ARGS.seed)
if not ARGS.usempc:
# Create the SLearner instance
LEARNER = SLearner(reward=reward, simulator=SIM, stateconverter=STATES,
actionconverter=ACTIONS, goal=goal, func=func, funcdim=FUNCDIM,
dfunc=dfunc, lrate=ARGS.rate, discount=ARGS.discount,
policy=ARGS.policy, depth=ARGS.maxdepth,
steps=ARGS.steps, seed=ARGS.seed,
stepsize=hierarchy if ARGS.hierarchical else lambda x:DELTA_T)
else:
LEARNER = ModelPredictiveController(dmap=moment, simulator=SIM,
stateconverter=STATES, actionconverter=ACTIONS,
depth=ARGS.maxdepth, seed=ARGS.seed,
density=ARGS.density)
# Print parameters if verbose
if ARGS.verbose:
for key, value in vars(ARGS).items():
try:
print('%12s: %-12s' % (key, value))
except:
pass
# Either run interactive server, or multiple trials
if ARGS.numtrials is None:
# Set up a server
APP = flask.Flask('Tanks', static_url_path='', static_folder='', template_folder='')
svec = np.zeros(12, dtype=float)
avec = np.zeros(6, dtype=int)
# Initial learning for RL controller
if not ARGS.disable and not ARGS.usempc:
LEARNER.learn(coverage=ARGS.coverage)
@APP.route('/')
def demo():
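        """Reset the state to the initial configuration and render the demo page."""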
svec[:] = np.array(ARGS.initial)
avec[:] = ARGS.initial[6:]
return flask.render_template('demo.html', N=100, T=6,
L=['1', '2', 'LA', 'RA', '3', '4'],
O=[0, 1, 2, 3, 4, 5])
@APP.route('/status/')
def status():
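        """Return the last computed state, then advance the simulation by one control step."""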
s = list(svec) # cache last results
a = list(avec)
w = list(LEARNER.weights)
if not ARGS.disable:
if LEARNER.random.rand() <= ARGS.explore: # re-learn
episodes = LEARNER.neighbours(svec)
LEARNER.random.shuffle(episodes)
LEARNER.learn(episodes=episodes[:int(np.ceil(len(episodes) * ARGS.density))])
avec[:] = LEARNER.recommend(svec)
svec[:] = LEARNER.next_state(svec, avec) # compute new results
if goal(s):
exit('Goal state reached.')
imbalance = -moment(s)
return flask.jsonify(levels=[str(i) for i in s],
action=' '.join(['{:2d}'.format(a) for a in avec]),
weights=[str(i) for i in w],
imbalance=imbalance) # return cached results
APP.run(debug=1, use_reloader=False, use_evalex=False)
else:
# Run multiple trials
    imbalances = [] # maximum imbalance in each trial
    lengths = [] # length of each trial until goal
    areas = [] # area under the imbalance curve for each trial
for i in range(ARGS.numtrials):
LEARNER.simulator.fault = LEARNER.random.choice(ARGS.fault) # introduce new fault
if not ARGS.disable: # re-learn on new trial
LEARNER.reset()
LEARNER.learn(coverage=ARGS.coverage)
svec = np.array(ARGS.initial) # all trials start with specified initial state
avec = svec[6:]
imbalance = [moment(svec)]
length = 1
while True:
if not ARGS.disable:
if LEARNER.random.rand() <= ARGS.explore: # explore
episodes = LEARNER.neighbours(svec)
LEARNER.random.shuffle(episodes)
LEARNER.learn(episodes=episodes[:int(np.ceil(len(episodes) * ARGS.density))])
avec = LEARNER.recommend(svec) # exploit
svec = LEARNER.next_state(svec, avec)
imbalance.append(moment(svec))
length += 1
if goal(svec): # quit trial on goal
imbalances.append(max(imbalance))
lengths.append(length)
areas.append(trapz(imbalance))
break
print('MaxImbalance: {0:6.2f}\tLength: {1:6d}\tTotalImbalance: {2:6.2f}'\
.format(np.mean(imbalances), int(np.mean(lengths)), np.mean(areas)))