eval_policy.py
"""
This file is used only to evaluate our trained policy/actor after
training in main.py with ppo.py.
"""
import numpy as np
import torch

def _log_summary(ep_len, ep_ret, ep_num):
    """
    Print to stdout what we've logged so far in the most recent episode.

    Parameters:
        ep_len - length of the most recent episode
        ep_ret - return of the most recent episode
        ep_num - index of the most recent episode

    Return:
        None
    """
    # Round decimal places for more aesthetic logging messages
    ep_len = str(round(ep_len, 2))
    ep_ret = str(round(ep_ret, 2))

    # Print logging statements
    print(flush=True)
    print(f"-------------------- Episode #{ep_num} --------------------", flush=True)
    print(f"Episodic Length: {ep_len}", flush=True)
    print(f"Episodic Return: {ep_ret}", flush=True)
    print(f"------------------------------------------------------", flush=True)
    print(flush=True)

# Method for testing the learnt policy.
# Needs to be terminated manually.
def rollout(policy, env, render, is_discrete):
    """
    Returns a generator to roll out each episode given a trained policy and
    environment to test on.

    Parameters:
        policy - The trained policy to test
        env - The environment to evaluate the policy on
        render - Specifies whether to render or not
        is_discrete - Whether the environment has a discrete action space

    Return:
        A generator object rollout, or iterable, which will return the latest
        episodic length and return on each iteration of the generator.

    Note:
        If you're unfamiliar with Python generators, check this out:
            https://wiki.python.org/moin/Generators
        If you're unfamiliar with Python "yield", check this out:
            https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    """
    # Rollout until user kills process
    while True:
        obs = env.reset()
        done = False

        # Number of timesteps so far
        t = 0

        # Logging data
        ep_len = 0    # episodic length
        ep_ret = 0    # episodic return
        traj = [obs]
        actions = []

        while not done:
            t += 1

            # Render environment if specified, off by default
            if render:
                env.render()

            # Query deterministic action from policy and run it
            if is_discrete:
                action = choose_best_action(obs, policy)
            else:
                action = policy(obs).detach().numpy()
            obs, rew, done, _ = env.step(action)
            actions.append(action)
            traj.append(obs)

            # Sum all episodic rewards as we go along
            ep_ret += rew

        # Track episodic length
        ep_len = t

        # (Debug) Uncomment to inspect low-reward episodes:
        # if ep_ret < 200:
        #     print(f'Reward ========= {ep_ret}')
        #     print(f'traj start ========= {traj[0]}')
        #     print(f'Action ========= {actions}')

        # Returns episodic length and return in this iteration
        yield ep_len, ep_ret
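
# Illustrative sketch (not part of the original training pipeline): consume the
# rollout generator for a fixed number of episodes instead of running forever.
# It assumes an old-style gym API (env.step returns a 4-tuple) and an actor
# network that accepts raw numpy observations, as in the training code; the
# environment id and episode count are placeholders.
def _example_rollout_fixed_episodes(policy, num_episodes=10):
    import gym
    env = gym.make('CartPole-v0')
    for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render=False, is_discrete=True)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
        if ep_num + 1 >= num_episodes:
            break
    env.close()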

# For discrete actions (e.g. the CartPole environment)
def choose_best_action(obs, policy):
    """
    Pick the greedy (highest-scoring) discrete action from the policy output.
    """
    y = policy(obs).squeeze()
    action = torch.argmax(y)
    return action.item()
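
# Minimal sketch of how choose_best_action resolves a discrete action: a dummy
# policy returning fixed scores stands in for the trained actor, and the
# observation shape is a CartPole-style placeholder.
def _example_choose_best_action():
    dummy_policy = lambda obs: torch.tensor([[0.2, 1.3]])  # two action scores
    obs = np.zeros(4)                                      # placeholder observation
    return choose_best_action(obs, dummy_policy)           # argmax -> action index 1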

# Function to display a single trajectory from a given start state
def display(observation, policy, env, is_discrete):
    """
    Replay a single trajectory starting from the given observation/state and
    return its episodic return, the visited trajectory, and its length.
    """
    name = env.unwrapped.spec.id
    ep_ret = 0

    # Setting the start state is specific to the environment: for the pendulum,
    # only the first two entries of the observation are used as the state.
    if name == 'Pendulum-v0':
        env.env.state = observation[0:2]
        obs = observation
    else:
        env.env.state = observation
        obs = np.array(env.env.state)

    traj = [obs]
    done = False
    n_steps = 0
    while not done:
        # env.render()
        if is_discrete:
            action = choose_best_action(obs, policy)
        else:
            action = policy(obs).detach().numpy()
        obs, rew, done, _ = env.step(action)

        # (Debug, BipedalWalker only) Uncomment to inspect extreme hull angles:
        # if env.env.hull.angle > 2 or env.env.hull.angle < -0.8:
        #     print(f'hull angle ============= {env.env.hull.angle}')

        traj.append(obs)

        # Sum all episodic rewards as we go along
        ep_ret += rew
        n_steps += 1

    print(f'Reward ============= {ep_ret}')
    return ep_ret, traj, n_steps
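
# Illustrative sketch (not part of the training pipeline): replay one trajectory
# from a hand-picked start observation on Pendulum-v0. The start observation
# values are placeholders, and the policy is assumed to accept raw numpy
# observations as in the training code.
def _example_display(policy):
    import gym
    env = gym.make('Pendulum-v0')
    env.reset()
    start_obs = np.array([1.0, 0.0, 0.0])  # placeholder [cos(theta), sin(theta), theta_dot]
    ep_ret, traj, n_steps = display(start_obs, policy, env, is_discrete=False)
    print(f'Replayed {n_steps} steps with return {ep_ret}')
    env.close()
    return ep_ret, traj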

def eval_policy(policy, env, render=False, is_discrete=False):
    """
    The main function to evaluate our policy with. It will iterate a generator object
    "rollout", which will simulate each episode and return the most recent episode's
    length and return. We can then log it right after. And yes, eval_policy will run
    forever until you kill the process.

    Parameters:
        policy - The trained policy to test, basically another name for our actor model
        env - The environment to test the policy on
        render - Whether we should render our episodes. False by default.
        is_discrete - Whether the environment has a discrete action space. False by default.

    Return:
        None

    NOTE: To learn more about generators, look at rollout's function description.
    """
    # Rollout with the policy and environment, and log each episode's data
    for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render, is_discrete)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)

def collect_batches(policy, critic, env, render=False):
    """
    Replay a single evaluation trajectory with the trained policy. The
    failure-trace collection loop below is left commented out.
    """
    # for ep_num, (ep_len, ep_ret) in enumerate(collect_failure_traces(policy, critic, env, render)):
    #     _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
    #     if ep_ret < 0:
    #         break

    # display expects (observation, policy, env, is_discrete), so replay a
    # trajectory starting from a fresh reset state
    obs = env.reset()
    display(obs, policy, env, False)
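
if __name__ == '__main__':
    # Illustrative entry point: load a trained actor checkpoint and evaluate it
    # on Pendulum-v0 until the process is killed. The FeedForwardNN import and
    # the './ppo_actor.pth' path are assumptions about how main.py/ppo.py save
    # the model; adjust them to your own setup.
    import gym
    from network import FeedForwardNN  # assumed actor architecture used in training

    env = gym.make('Pendulum-v0')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    policy = FeedForwardNN(obs_dim, act_dim)
    policy.load_state_dict(torch.load('./ppo_actor.pth'))

    eval_policy(policy, env, render=False, is_discrete=False)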