# layer.py
import numpy as np
import torch
from collections import defaultdict
from copy import deepcopy

from experience_buffer import ExperienceBuffer, PrioritizedReplayBuffer
from utils import oracle_action, project_state, render_image_for_video

class Layer():

    def __init__(self, layer_number, FLAGS, env, sess, agent_params):
        self.layer_number = layer_number
        self.FLAGS = FLAGS
        self.sess = sess
        self.last_layer = self.layer_number == self.FLAGS.layers - 1
        self.sl_oracle = (self.FLAGS.sl_oracle and self.last_layer)
        self.semi_oracle = (self.FLAGS.semi_oracle and self.last_layer)
        self.relative_subgoals = self.FLAGS.relative_subgoals and (self.layer_number < self.FLAGS.layers - 1)

        # Set time limit for each layer. If the agent uses only 1 layer, the time limit is the max number of
        # low-level actions allowed in the episode (i.e., env.max_actions).
        if FLAGS.layers > 1:
            self.time_limit = FLAGS.time_scale
        else:
            self.time_limit = env.max_actions

        self.current_state = None
        self.current_image = None
        self.current_goal_image = None
        self.goal = None

        # Initialize replay buffer. The variables below determine its size.
        # Ceiling on buffer size
        self.buffer_size_ceiling = 10**7
        # Number of full episodes stored in replay buffer
        self.episodes_to_store = agent_params["episodes_to_store"]
        # Number of transitions to serve as replay goals during goal replay
        self.num_replay_goals = 2
        self.attempts_made = 0

        # Number of transitions created for each attempt (i.e., action replay + goal replay + subgoal testing)
        if self.layer_number == 0:
            self.trans_per_attempt = (1 + self.num_replay_goals) * self.time_limit
        else:
            self.trans_per_attempt = (1 + self.num_replay_goals) * self.time_limit + int(self.time_limit / 3)

        # Buffer size = transitions per attempt * attempts per episode * number of episodes stored
        self.buffer_size = min(self.trans_per_attempt * self.time_limit**(self.FLAGS.layers - 1 - self.layer_number) * self.episodes_to_store, self.buffer_size_ceiling)
        # self.buffer_size = 10000000
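        # A worked example of the sizing above, using hypothetical numbers (not the FLAGS defaults):
        # with time_scale = 10, num_replay_goals = 2, layers = 2 and episodes_to_store = 500,
        # layer 0 stores (1 + 2) * 10 = 30 transitions per attempt and up to
        # 30 * 10**(2 - 1 - 0) * 500 = 150,000 transitions, well below the 10**7 ceiling.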
        self.batch_size = 1024
        if not FLAGS.test:
            buffer_class = PrioritizedReplayBuffer if (self.FLAGS.priority_replay and not self.sl_oracle) else ExperienceBuffer
            self.replay_buffer = buffer_class(self.buffer_size, self.batch_size, device=self.sess, FLAGS=FLAGS, env=env, layer_number=self.layer_number)

        # Create buffer to store not yet finalized goal replay transitions
        self.temp_goal_replay_storage = []

        # Initialize actor and critic networks
        if self.FLAGS.torch:
            if self.last_layer and self.FLAGS.vpn:
                if self.FLAGS.vpn_dqn:
                    from vpn_dqn_actor import Actor
                    from vpn_dqn_critic import Critic
                else:
                    from vpn_actor import Actor
                    from vpn_critic import Critic
                self.vpn = True
            else:
                from torch_actor import Actor
                from torch_critic import Critic
                self.vpn = False
        else:
            from tf_actor import Actor
            from tf_critic import Critic
            self.vpn = False  # self.vpn is referenced below when constructing the actor

        if FLAGS.humanoid and self.layer_number == 0:
            from humanoid_actor import HumanoidActor
            self.actor = HumanoidActor(sess, env, FLAGS)
        elif FLAGS.inject_her_policy and self.layer_number == 0:
            from her_actor import HERActor
            self.actor = HERActor(sess, env, FLAGS)
        else:
            critic_class = BayesianCritic if FLAGS.bayes else Critic
            self.critic = critic_class(sess, env, self.layer_number, FLAGS)
            rl_actor_class = SacActor if self.FLAGS.sac else Actor
            actor_class = SLActor if (self.sl_oracle or self.semi_oracle) else rl_actor_class
            self.actor = actor_class(sess, env, self.batch_size, self.layer_number, FLAGS, self.critic.vpn if self.vpn else None)

        # Stores metrics for later aggregation
        self.agg_metrics = defaultdict(list)

        # Parameter determines degree of noise added to actions during training
        # self.noise_perc = noise_perc
        if self.layer_number == 0:
            self.noise_perc = self.to_torch(agent_params["atomic_noise"])
        elif self.last_layer and self.FLAGS.vpn:
            self.noise_perc = self.to_torch(agent_params["vpn_noise"])
        else:
            self.noise_perc = self.to_torch(agent_params["subgoal_noise"])

        if self.layer_number == 0:
            self.action_bounds = self.to_torch(env.action_bounds)
            self.action_offset = self.to_torch(env.action_offset)
        else:
            self.action_bounds = self.to_torch(env.subgoal_bounds_symmetric)
            self.action_offset = self.to_torch(env.subgoal_bounds_offset)

        # Flag to indicate when the layer has run out of attempts to achieve its goal. This will be important for subgoal testing.
        self.maxed_out = False

        self.subgoal_penalty = agent_params["subgoal_penalty"]
        self.subgoal_test_perc = agent_params["subgoal_test_perc"]
        if self.last_layer and (self.FLAGS.always_penalize or self.FLAGS.Q_penalize):
            self.subgoal_test_perc = 0.0
    def to_torch(self, value):
        return torch.tensor(value, dtype=torch.float32, device=self.sess)

    def copy_transition(self, trans):
        return [None if arr is None else torch.clone(arr) if isinstance(arr, torch.Tensor) else arr for arr in trans]
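    # Replay transitions throughout this class are lists with the following layout
    # (inferred from how the methods below index them):
    #   [0] state, [1] action (or hindsight subgoal), [2] reward, [3] next state,
    #   [4] goal, [5] finished/terminal flag, [6] goal achieved in hindsight (global frame),
    #   [7] oracle action label (semi/SL oracle only, else None), [8] image (VPN layers only).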
    # Add noise to provided action
    def add_noise(self, action, env):
        # Noise added will be a percentage of the action range
        assert len(action) == len(self.action_bounds), "Action bounds must have same dimension as action"
        assert len(action) == len(self.noise_perc), "Noise percentage vector must have same dimension as action"

        # Add noise to action and ensure it remains within bounds
        action += torch.randn_like(action) * self.noise_perc * self.action_bounds
        # Clip the action to stay in range
        action = torch.max(torch.min(action, self.action_bounds + self.action_offset), -self.action_bounds + self.action_offset)
        return action
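    # For reference: per dimension i the exploration noise above has standard deviation
    # noise_perc[i] * action_bounds[i], and the clipped range is
    # [action_offset[i] - action_bounds[i], action_offset[i] + action_bounds[i]].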
    # Add target-policy smoothing noise to the provided target actions (only when TD3 is enabled)
    def add_target_noise(self, actions, env):
        if not self.FLAGS.td3:
            return actions

        # Noise added will be a percentage of the action range
        if self.layer_number == 0:
            action_bounds = env.action_bounds
            action_offset = env.action_offset
        else:
            action_bounds = env.subgoal_bounds_symmetric
            action_offset = env.subgoal_bounds_offset

        for action in actions:
            assert len(action) == len(action_bounds), ("Action bounds must have same dimension as action", self.layer_number, len(action), len(action_bounds))
            assert len(action) == len(self.noise_perc), "Noise percentage vector must have same dimension as action"

            # Add noise to each action dimension and ensure it remains within bounds
            for i in range(len(action)):
                if self.layer_number == 0:
                    noise_std = self.noise_perc[i] * action_bounds[i]
                    noise = np.clip(np.random.normal(0, 2 * noise_std), -5 * noise_std, 5 * noise_std)
                else:
                    noise_std = self.noise_perc[i] * action_bounds[i]
                    noise = np.clip(np.random.normal(0, noise_std), -.5, .5)
                action[i] += noise
                action[i] = max(min(action[i], action_bounds[i] + action_offset[i]), -action_bounds[i] + action_offset[i])

        return actions
    # Select random action
    def get_random_action(self, env):
        return torch.rand(len(self.action_bounds), dtype=torch.float32, device=self.sess) * 2 * self.action_bounds - self.action_bounds + self.action_offset
    # Function selects action using an epsilon-greedy policy
    def choose_action(self, agent, env, subgoal_test):
        action, next_subgoal_test = None, None

        # If testing mode or testing subgoals, action is output of actor network without noise
        if agent.FLAGS.test or subgoal_test:
            current_image = self.current_goal_image.unsqueeze(0) if (self.FLAGS.vpn and self.last_layer) else None
            action = self.actor.get_action(self.current_state.unsqueeze(0), self.goal.unsqueeze(0), current_image, noise=False).squeeze(0)
            next_subgoal_test = subgoal_test
        else:
            if np.random.random_sample() > 0.2:
                # Choose noisy action
                current_image = self.current_goal_image.unsqueeze(0) if (self.FLAGS.vpn and self.last_layer) else None
                action = self.actor.get_action(self.current_state.unsqueeze(0), self.goal.unsqueeze(0), current_image).squeeze(0)
                action = action if self.FLAGS.sac else self.add_noise(action, env)
            # Otherwise, choose random action
            else:
                action = self.get_random_action(env)
                if self.relative_subgoals and self.layer_number > 0:
                    action -= project_state(env, self.FLAGS, self.layer_number, self.current_state)

            # Determine whether to test upcoming subgoal
            if np.random.random_sample() < self.subgoal_test_perc:
                next_subgoal_test = True
            else:
                next_subgoal_test = False

        return action, next_subgoal_test
    # Create action replay transition by evaluating hindsight action given original goal
    def perform_action_replay(self, hindsight_action, next_state, goal_status, action_label):
        # Determine reward (0 if goal achieved, -1 otherwise) and finished boolean.
        # The finished boolean is used for determining the target for Q-value updates.
        if goal_status[self.layer_number]:
            reward = 0
            finished = True
        else:
            reward = -1
            finished = False

        # Transition takes the form [old state, hindsight action, reward, next state, goal, finished boolean,
        # hindsight goal (unused here), action label, goal image]
        transition = [self.current_state, hindsight_action, reward, next_state, self.goal, finished, None, action_label, self.current_goal_image]

        if self.FLAGS.all_trans or self.FLAGS.hind_action:
            print("\nLevel %d Hindsight Action: " % self.layer_number, transition)

        # Add action replay transition to layer's replay buffer
        self.replay_buffer.add(self.copy_transition(transition))
    # Create initial goal replay transitions
    def create_prelim_goal_replay_trans(self, hindsight_action, next_state, env, total_layers):
        # Create a transition evaluating the hindsight action for some goal to be determined in the future.
        # The goal will ultimately be selected from states the layer has traversed through. The transition
        # takes the form [old state, hindsight action, reward = None, next state, goal = None, finished = None,
        # next state projected to the subgoal/end goal space, action label = None, image].
        if self.layer_number == total_layers - 1 or (self.layer_number == total_layers - 2 and self.FLAGS.oracle):
            hindsight_goal = env.project_state_to_end_goal(env.sim, next_state)
        else:
            hindsight_goal = env.project_state_to_subgoal(env.sim, next_state)

        # state, action, reward, next_state, goal, terminal, global_hindsight_goal
        transition = [self.current_state, hindsight_action, None, next_state, None, None, hindsight_goal, None, self.current_image]

        if self.FLAGS.all_trans or self.FLAGS.prelim_HER:
            print("\nLevel %d Prelim HER: " % self.layer_number, transition)

        self.temp_goal_replay_storage.append(self.copy_transition(transition))

        """
        # Designer can create some additional goal replay transitions. For instance, higher level transitions
        # can be replayed with the subgoal achieved in hindsight as the original goal.
        if self.layer_number > 0:
            transition_b = [self.current_state, hindsight_action, 0, next_state, hindsight_goal, True, None]
            # print("\nGoal Replay B: ", transition_b)
            self.replay_buffer.add(np.copy(transition_b))
        """
    # Return reward given provided goal and goal achieved in hindsight
    def get_reward(self, new_global_goal, global_hindsight_goal, goal_thresholds):
        assert len(new_global_goal) == len(global_hindsight_goal) == len(goal_thresholds), "Goal, hindsight goal, and goal thresholds do not have same dimensions"

        # If the difference in any dimension is greater than threshold, goal not achieved
        if (torch.abs(new_global_goal - global_hindsight_goal) > goal_thresholds).any():
            return -1
        # Else goal is achieved
        return 0
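    # Hypothetical example of the threshold test above: with goal (1.0, 2.0), hindsight goal (1.15, 2.03)
    # and thresholds (0.1, 0.1), the first dimension differs by 0.15 > 0.1, so the reward is -1;
    # with thresholds (0.2, 0.1) both differences are within bounds and the reward would be 0.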
    # Finalize goal replay by filling in goal, reward, and finished boolean for the preliminary goal replay transitions created before
    def finalize_goal_replay(self, env, goal_thresholds):
        # Choose transitions to serve as goals during goal replay. The last transition will always be used
        num_trans = len(self.temp_goal_replay_storage)
        if num_trans == 0:
            return

        # If there are fewer transitions than the ordinary number of replay goals, lower the number of replay goals
        num_replay_goals = min(self.num_replay_goals, num_trans)

        if self.FLAGS.all_trans or self.FLAGS.HER:
            print("\n\nPerforming Goal Replay for Level %d\n\n" % self.layer_number)
            print("Num Trans: ", num_trans, ", Num Replay Goals: ", num_replay_goals)

        # For each stored transition, create num_replay_goals copies, each using the hindsight goal of a transition
        # sampled from the remainder of the episode (the last transition is always one of them) as the new goal.
        # Given the new goal, update the reward and finished boolean as well.
        for index in range(num_trans):
            for i in range(num_replay_goals):
                if i == num_replay_goals - 1:
                    future_index = num_trans - 1
                else:
                    future_index = np.random.randint(index, num_trans)
                new_global_goal = torch.clone(self.temp_goal_replay_storage[future_index][6])
                trans_copy = [None if item is None else torch.clone(item) for item in self.temp_goal_replay_storage[index]]

                # Update goal to new goal
                if self.last_layer and self.FLAGS.vpn:
                    trans_copy[8] = torch.stack([trans_copy[8], env.pos_image(new_global_goal, trans_copy[8])], dim=0)
                if self.relative_subgoals:
                    state_pos = project_state(env, self.FLAGS, self.layer_number, trans_copy[0])
                    trans_copy[4] = (new_global_goal - state_pos)
                else:
                    trans_copy[4] = new_global_goal

                # Update reward
                trans_copy[2] = self.get_reward(new_global_goal, trans_copy[6], goal_thresholds)

                # Update finished boolean based on reward
                if trans_copy[2] == 0:
                    trans_copy[5] = True
                else:
                    trans_copy[5] = False

                # Add finished transition to replay buffer
                if self.FLAGS.all_trans or self.FLAGS.HER:
                    print("\nNew Goal: ", new_global_goal)
                    print("Upd Trans %d: " % index, trans_copy)
                self.replay_buffer.add(trans_copy)

        # Clear storage for preliminary goal replay transitions at end of goal replay
        self.temp_goal_replay_storage = []
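    # In short, this is the hindsight experience replay (HER) step for this layer: every stored transition is
    # duplicated num_replay_goals times, each copy relabelled with a goal that was actually achieved later in
    # the episode (the final achieved state is always one of them), and the reward/terminal flag is recomputed
    # against the layer's goal thresholds before the copies are pushed to the replay buffer.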
    # Create transition penalizing subgoal if necessary. The target Q-value when this transition is used will
    # ignore the next state because the finished boolean = True. Change the finished boolean to False if you
    # would like the subgoal penalty to depend on the next state.
    def penalize_subgoal(self, subgoal, next_state, high_level_goal_achieved, action_label):
        transition = [self.current_state, subgoal, self.subgoal_penalty, next_state, self.goal, True, None, action_label, self.current_goal_image]
        if self.FLAGS.all_trans or self.FLAGS.penalty:
            print("Level %d Penalty Trans: " % self.layer_number, transition)
        self.replay_buffer.add(self.copy_transition(transition))
    # Determine whether layer is finished training
    def return_to_higher_level(self, max_lay_achieved, agent, env, attempts_made):
        # Return to higher level if (i) a higher level goal has been reached, (ii) maxed out episode time steps (env.max_actions),
        # (iii) not testing and layer is out of attempts, or (iv) testing, layer is not the highest level, and layer is out of attempts.
        # NOTE: during testing, the highest level will continue to output subgoals until either (i) the maximum number of episode
        # time steps is reached or (ii) the end goal has been achieved.

        # Return to previous level when any higher level goal achieved.
        # NOTE: if not testing and the agent achieves the end goal, training will continue until out of time (i.e., out of time
        # steps or the highest level runs out of attempts). This allows the agent to experience being around the end goal.
        if max_lay_achieved is not None and max_lay_achieved >= self.layer_number:
            return True
        if not env.healthy:
            return True
        # Return when out of time
        elif agent.steps_taken >= env.max_actions:
            return True
        # Return when layer has maxed out attempts
        elif not agent.FLAGS.test and attempts_made >= self.time_limit:
            return True
        # NOTE: During testing, agent will have env.max_actions attempts to achieve goal
        elif agent.FLAGS.test and self.layer_number < agent.FLAGS.layers - 1 and attempts_made >= self.time_limit:
            return True
        else:
            return False
    def transform_path(self, path, offset):
        for node in path:
            node[0] -= offset
            node[1] -= offset
        return path
    # Learn to achieve goals with actions belonging to the appropriate time scale.
    # "goal_array" contains the goal states for the current layer and all higher layers.
    def train(self, agent, env, metrics, subgoal_test=False, episode_num=None):
        # print("\nTraining Layer %d" % self.layer_number)

        # Set layer's current state and new goal state
        self.goal = agent.goal_array[self.layer_number].clone()
        self.current_state = agent.current_state
        if self.last_layer and self.FLAGS.vpn:
            self.current_image = self.to_torch(env.take_snapshot())
            self.current_goal_image = torch.stack([self.current_image, env.pos_image(self.goal, self.current_image)], dim=0)

        # Reset flag indicating whether layer has run out of attempts. This will be used for subgoal testing.
        self.maxed_out = False

        # Display all subgoals if visualizing training and current layer is bottom layer
        if self.layer_number == 0 and (agent.FLAGS.show or agent.FLAGS.save_video) and agent.FLAGS.layers > 1:
            env.display_subgoals([arr.cpu().numpy() for arr in agent.goal_array], agent.FLAGS)

        # Current layer has self.time_limit attempts to reach its goal state.
        self.attempts_made = 0

        while True:
            # Select action to achieve goal state using epsilon-greedy policy, or greedy policy if in test mode
            action, next_subgoal_test = self.choose_action(agent, env, subgoal_test)

            if self.layer_number > 0 and not (self.FLAGS.humanoid or self.FLAGS.inject_her_policy):
                subgoal_distance = torch.norm(action[:2] if self.relative_subgoals else (action[:2] - self.current_state[:2])).item()
                self.agg_metrics['subgoal_distances'].append(subgoal_distance)
                goal_subgoal_distance = torch.norm((self.goal[:2] - self.current_state[:2] - action[:2]) if self.relative_subgoals else (self.goal[:2] - action[:2])).item()
                self.agg_metrics['goal_subgoal_distance'].append(goal_subgoal_distance)
                lower_layer = agent.layers[self.layer_number - 1]
                lower_action = lower_layer.actor.get_action(self.current_state.unsqueeze(0), action.unsqueeze(0), None)[0]
                lower_Q = lower_layer.critic.get_Q_value(self.current_state.unsqueeze(0), action.unsqueeze(0), lower_action.unsqueeze(0), None).item()
                self.agg_metrics['lower_Q_val'].append(lower_Q)

            # If next layer is not bottom level, propose subgoal for next layer to achieve and determine whether that subgoal should be tested
            if self.layer_number > 0:
                action_copy = action.clone()
                if self.FLAGS.relative_subgoals and self.last_layer:
                    action_copy -= project_state(env, self.FLAGS, self.layer_number, self.current_state)
                agent.goal_array[self.layer_number - 1] = action_copy
                goal_status, max_lay_achieved = agent.layers[self.layer_number - 1].train(agent, env, metrics, next_subgoal_test, episode_num)
                if self.last_layer and self.FLAGS.learn_sigma and not agent.FLAGS.test:
                    self.actor.bandit.store_reward(agent.layers[self.layer_number - 1].attempts_made)

            # If layer is bottom level, execute low-level action
            else:
                next_state = self.to_torch(env.execute_action(action.cpu().numpy()))
                if self.FLAGS.save_video:
                    real_image = render_image_for_video(env, self.FLAGS, agent, next_state)
                    agent.image_path.append(real_image)

                # Increment steps taken
                agent.steps_taken += 1
                if not self.FLAGS.test:
                    agent.total_steps_taken += 1
                if agent.steps_taken >= env.max_actions:
                    print("Out of actions (Steps: %d)" % agent.steps_taken)

                agent.current_state = next_state
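                # With relative subgoals, each layer's goal is expressed relative to the agent's current
                # position, so after every low-level step the stored goals are shifted by the change in
                # position (old_pos - new_pos). For instance, if the agent moves from x = 1.0 to x = 1.4,
                # a relative goal of +2.0 becomes +1.6: the same absolute target, seen from the new position.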
                if self.FLAGS.relative_subgoals:
                    for i_layer in range(self.FLAGS.layers - 1):
                        old_pos = project_state(env, self.FLAGS, i_layer, self.current_state)
                        new_pos = project_state(env, self.FLAGS, i_layer, agent.current_state)
                        agent.goal_array[i_layer] = agent.goal_array[i_layer] + old_pos - new_pos

                # Determine whether any of the goals from any layer was achieved and, if applicable, the highest layer whose goal was achieved
                goal_status, max_lay_achieved = agent.check_goals(env)

            self.attempts_made += 1

            # Perform hindsight learning using action actually executed (low-level action or hindsight subgoal)
            if self.layer_number == 0:
                hindsight_action = action
            else:
                # If subgoal action was achieved by layer below, use this as hindsight action
                if goal_status[self.layer_number - 1]:
                    hindsight_action = action
                # Otherwise, use subgoal that was achieved in hindsight
                else:
                    if self.relative_subgoals:
                        hindsight_action = torch.clone(env.project_state_to_subgoal(env.sim, agent.current_state) - env.project_state_to_subgoal(env.sim, self.current_state))
                    else:
                        hindsight_action = torch.clone(env.project_state_to_subgoal(env.sim, agent.current_state))

            # Next, create hindsight transitions if not testing and env still healthy
            if not agent.FLAGS.test and env.healthy:
                if self.sl_oracle:
                    transition = [self.current_state, action, 0, agent.current_state, self.goal, True, None, oracle_action(self.FLAGS, self.current_state, self.goal, env)]
                    self.replay_buffer.add(self.copy_transition(transition))
                else:
                    # Create action replay transition by evaluating hindsight action given current goal
                    self.perform_action_replay(hindsight_action, agent.current_state, goal_status, oracle_action(self.FLAGS, self.current_state, self.goal, env) if self.semi_oracle else None)
                    # Create preliminary goal replay transitions. The goal and reward in these transitions will be
                    # finalized when this layer has run out of attempts or the goal has been achieved.
                    self.create_prelim_goal_replay_trans(hindsight_action, agent.current_state, env, agent.FLAGS.layers)

                # Penalize subgoals if subgoal testing and subgoal was missed by lower layers after maximum number of attempts
                if self.layer_number > 0 and next_subgoal_test and agent.layers[self.layer_number - 1].maxed_out:
                    self.penalize_subgoal(action, agent.current_state, goal_status[self.layer_number], oracle_action(self.FLAGS, self.current_state, self.goal, env) if self.semi_oracle else None)
                # Penalize subgoals for the highest level if always-penalize is on and the lower layers ran out of attempts
                elif self.last_layer and self.FLAGS.always_penalize and agent.layers[self.layer_number - 1].maxed_out:
                    self.penalize_subgoal(action, agent.current_state, goal_status[self.layer_number], oracle_action(self.FLAGS, self.current_state, self.goal, env) if self.semi_oracle else None)
                # Penalize subgoals if the lower level thinks the subgoal is reachable but could not reach it (probably a wall)
                elif self.last_layer and self.FLAGS.Q_penalize:
                    lower_layer = agent.layers[self.layer_number - 1]
                    action_copy = action.clone()
                    if self.FLAGS.relative_subgoals:
                        action_copy -= project_state(env, self.FLAGS, self.layer_number, self.current_state)
                    lower_action, _ = lower_layer.actor.get_target_action(self.current_state.unsqueeze(0), action_copy.unsqueeze(0), None)
                    lower_Q_val = lower_layer.critic.get_target_Q_value(self.current_state.unsqueeze(0), action_copy.unsqueeze(0), lower_action, None).item()
                    if lower_Q_val >= -self.FLAGS.time_scale + 2 and agent.layers[self.layer_number - 1].maxed_out:
                        self.penalize_subgoal(action, agent.current_state, goal_status[self.layer_number], oracle_action(self.FLAGS, self.current_state, self.goal, env) if self.semi_oracle else None)
            elif not agent.FLAGS.test and not env.healthy and self.layer_number == 0:
                self.penalize_subgoal(action, agent.current_state, goal_status[self.layer_number], oracle_action(self.FLAGS, self.current_state, self.goal, env) if self.semi_oracle else None)
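            # Summary of what one attempt can add to the replay buffer during training: an action replay
            # transition evaluated against the current goal, a preliminary HER transition whose goal and
            # reward are filled in later by finalize_goal_replay, and, when a tested subgoal was missed
            # (or one of the penalty flags above applies), a subgoal penalty transition.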
            # Print summary of transition
            if agent.FLAGS.verbose:
                print("\nEpisode %d, Level %d, Attempt %d" % (episode_num, self.layer_number, self.attempts_made))
                # print("Goal Array: ", agent.goal_array, "Max Lay Achieved: ", max_lay_achieved)
                print("Old State: ", self.current_state)
                print("Hindsight Action: ", hindsight_action)
                print("Original Action: ", action)
                print("Next State: ", agent.current_state)
                print("Goal: ", self.goal)
                if self.layer_number == agent.FLAGS.layers - 1:
                    print("Hindsight Goal: ", env.project_state_to_end_goal(env.sim, agent.current_state))
                else:
                    print("Hindsight Goal: ", env.project_state_to_subgoal(env.sim, agent.current_state))
                print("Goal Status: ", goal_status, "\n")
                print("All Goals: ", agent.goal_array)

            # Update state of current layer
            self.current_state = agent.current_state
            if self.relative_subgoals:
                self.goal = agent.goal_array[self.layer_number].clone()
                if self.layer_number == 0 and (agent.FLAGS.show or agent.FLAGS.save_video) and agent.FLAGS.layers > 1:
                    env.display_subgoals([arr.cpu().numpy() for arr in agent.goal_array], agent.FLAGS)
            if self.last_layer and self.FLAGS.vpn:
                self.current_image = self.to_torch(env.take_snapshot())
                self.current_goal_image = torch.stack([self.current_image, env.pos_image(self.goal, self.current_image)], dim=0)

            # Return to previous level to receive next subgoal if applicable
            # if self.return_to_higher_level(max_lay_achieved, agent, env, attempts_made):
            if (max_lay_achieved is not None and max_lay_achieved >= self.layer_number) or agent.steps_taken >= env.max_actions or self.attempts_made >= self.time_limit or not env.healthy:
                # If goal was not achieved after max number of attempts, set maxed out flag to true
                if self.attempts_made >= self.time_limit and not goal_status[self.layer_number]:
                    self.maxed_out = True

                # If not testing, finish goal replay by filling in missing goal and reward values before returning to prior level
                if not agent.FLAGS.test:
                    if self.layer_number == agent.FLAGS.layers - 1 or (self.layer_number == agent.FLAGS.layers - 2 and self.FLAGS.oracle):
                        goal_thresholds = self.to_torch(env.end_goal_thresholds)
                    else:
                        goal_thresholds = self.to_torch(env.subgoal_thresholds)
                    if self.last_layer and self.FLAGS.radius_learner:
                        agent.radius_learner.add_paths(env, self.FLAGS, self.temp_goal_replay_storage)
                    if not self.sl_oracle:
                        self.finalize_goal_replay(env, goal_thresholds)

                # Under certain circumstances, the highest layer will not seek a new end goal
                if self.return_to_higher_level(max_lay_achieved, agent, env, self.attempts_made):
                    if self.layer_number == agent.FLAGS.layers - 1 and agent.FLAGS.test:
                        print("HL Attempts Made: ", self.attempts_made)
                    return goal_status, max_lay_achieved
    # Update actor and critic networks
    def learn(self, env, agent, num_updates, metrics):
        for j in range(num_updates):
            # Update weights of non-target networks
            if self.last_layer and self.FLAGS.learn_sigma and self.actor.bandit.replay_buffer.size > 100:
                self.actor.bandit.update(j, metrics)

            if self.replay_buffer.size >= 250:
                idx, (old_states, actions, rewards, new_states, goals, is_terminals, oracle_actions, images), is_weights = self.replay_buffer.get_batch()

                # With relative subgoals, re-express the stored goal from the next state's frame of reference
                if self.relative_subgoals:
                    new_goals = goals + project_state(env, self.FLAGS, self.layer_number, old_states) - project_state(env, self.FLAGS, self.layer_number, new_states)
                else:
                    new_goals = goals

                next_batch_size = min(self.replay_buffer.size, self.replay_buffer.batch_size)
                next_action, next_entropy = self.actor.get_target_action(new_states, new_goals, images)
                errors = self.critic.update(old_states, actions, rewards, new_states, goals, new_goals, next_action, is_terminals, is_weights, next_entropy, images, metrics, total_steps_taken=agent.total_steps_taken)
                self.replay_buffer.batch_update(idx, errors)

                action_derivs = self.critic.get_gradients_for_actions(old_states, goals, self.actor, images)
                goal_derivs = None
                if self.FLAGS.actor_grads and self.layer_number > 0 and self.last_layer:
                    subgoals = self.actor.get_target_action_for_goal_grads(old_states, images)
                    if self.FLAGS.relative_subgoals and self.last_layer:
                        assert len(subgoals) == len(old_states)
                        subgoals = subgoals - project_state(env, self.FLAGS, self.layer_number, old_states)
                    goal_derivs = agent.layers[self.layer_number - 1].critic.get_gradients_for_goals(old_states, subgoals, agent.layers[self.layer_number - 1].actor.get_target_action(old_states, subgoals, None)[0])

                if self.layer_number > 0:
                    lower_critic = agent.layers[self.layer_number - 1].critic
                    lower_actor = agent.layers[self.layer_number - 1].actor
                    subgoals = self.actor.get_target_action(old_states, goals, images)[0]
                    if self.FLAGS.relative_subgoals and self.last_layer:
                        assert len(subgoals) == len(old_states)
                        subgoals = subgoals - project_state(env, self.FLAGS, self.layer_number, old_states)
                    if self.FLAGS.torch:
                        Q_val_lower = lower_critic.get_target_Q_value(old_states, subgoals, lower_actor.get_target_action(old_states, subgoals, None)[0], None)
                        metrics['buffer/Q_val_lower%d' % self.layer_number] = torch.mean(Q_val_lower).item()
                        metrics['buffer/Q_val_lower_clipped%d' % self.layer_number] = torch.mean((Q_val_lower < -self.FLAGS.time_scale + 1e-6).float()).item()
                        metrics['buffer/Q_val_lower_too_low%d' % self.layer_number] = torch.mean((Q_val_lower > -1.5).float()).item()

                # Delayed actor update (every other step when TD3 is enabled)
                if (not self.FLAGS.td3) or (j % 2 == 0):
                    if self.sl_oracle or self.semi_oracle:
                        self.actor.update(old_states, goals, action_derivs, next_batch_size, oracle_actions, metrics, goal_derivs)
                    else:
                        self.actor.update(old_states, goals, action_derivs, next_batch_size, metrics, goal_derivs)

            # Update weights of target networks
            if not self.FLAGS.no_target_net:
                self.critic.update_target_weights()
                self.actor.update_target_weights()
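
# A minimal sketch of how an agent is expected to drive a Layer during training, assuming the calling
# conventions visible above (the surrounding Agent and environment classes are not shown in this file,
# so the exact names here are illustrative only):
#
#   layers = [Layer(i, FLAGS, env, device, agent_params) for i in range(FLAGS.layers)]
#   goal_status, max_lay_achieved = layers[-1].train(agent, env, metrics, subgoal_test=False, episode_num=ep)
#   for layer in layers:
#       layer.learn(env, agent, num_updates, metrics)
#
# train() recurses down the hierarchy (each proposed subgoal becomes the goal of the layer below), while
# learn() performs the critic/actor updates from each layer's own replay buffer.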