-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
102 lines (72 loc) · 2.94 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import argparse
import gym
import torch
from gym.wrappers import AtariPreprocessing
import config
from utils import set_device, PongActionWrapper
# Device shared by every tensor created in this module; chosen by utils.set_device().
device = set_device()

# Command-line interface of the evaluation script. The parser is only built
# here; arguments are parsed inside the ``__main__`` block.
parser = argparse.ArgumentParser()
# --env and --path are required: without them the script cannot create an
# environment or load a model, so fail early with a usage message.
parser.add_argument('--env', choices=['CartPole-v0', 'CartPole-v1', 'Pong-v0'], required=True,
                    help='Environment to evaluate on.')
parser.add_argument('--path', type=str, required=True, help='Path to stored DQN model.')
# nargs='?' allows the flag to be given without a value; const=1 makes that
# case fall back to one episode instead of storing None.
parser.add_argument('--n_eval_episodes', type=int, nargs='?', default=1, const=1,
                    help='Number of evaluation episodes.')
parser.add_argument('--render', dest='render', action='store_true', help='Render the environment.')
parser.add_argument('--save_video', dest='save_video', action='store_true', help='Save the episodes as video.')
parser.set_defaults(render=False, save_video=False)

# Hyperparameter configurations for different environments. See config.py.
ENV_CONFIGS = {
    'CartPole-v0': config.CartPole,
    'CartPole-v1': config.CartPole,
    'Pong-v0': config.Pong,
}
def evaluate_policy(dqn, env, env_config, args, n_episodes, render=False, verbose=False):
    """Run ``n_episodes`` exploiting episodes and return the mean episode return.

    Each episode starts by repeating the first observation
    ``env_config["obs_stack_size"]`` times along dim 0 and adding a leading
    batch dimension. After every non-terminal step the oldest observation is
    dropped from the front of dim 1 of the stack and the newest observation
    is appended at the end.

    Args:
        dqn: Policy exposing ``act(obs_stack, exploit=True)``; the returned
            value must support ``.item()``.
        env: Environment following the classic gym API: ``reset()`` returns an
            observation and ``step(action)`` returns
            ``(obs, reward, done, info)``.
        env_config: Mapping that provides ``"obs_stack_size"``.
        args: Unused; kept so existing callers keep working.
        n_episodes: Number of episodes to run.
        render: If true, call ``env.render()`` before every action.
        verbose: If true, print the return of every finished episode.

    Returns:
        The mean return over all episodes, or ``0.0`` when ``n_episodes`` is
        not positive (there is nothing to average).
    """
    if n_episodes <= 0:
        # Dividing by zero episodes would raise ZeroDivisionError.
        return 0.0

    stack_dim = 1  # Dim of the batched stack along which observations are joined.
    total_return = 0

    for episode in range(n_episodes):
        obs = torch.tensor(env.reset(), device=device, dtype=torch.float32)
        # Length one observation occupies along the stacking dim. Derived from
        # the observation itself instead of assuming 84, so the stack keeps
        # its size for any observation shape.
        obs_len = obs.shape[0]
        obs_stack = torch.cat(env_config["obs_stack_size"] * [obs]).unsqueeze(0)

        done = False
        episode_return = 0

        while not done:
            if render:
                env.render()

            # Evaluation only needs the chosen action, so skip autograd bookkeeping.
            with torch.no_grad():
                action = dqn.act(obs_stack, exploit=True).item()

            obs, reward, done, _ = env.step(action)

            if not done:
                # Same dtype as the reset observation so the stack stays float32.
                obs = torch.tensor(obs, device=device, dtype=torch.float32)
                # Slide the window: drop the oldest observation, append the newest.
                obs_stack = torch.cat((obs_stack[:, obs_len:, ...], obs.unsqueeze(0)), dim=stack_dim)

            episode_return += reward

        total_return += episode_return

        if verbose:
            print(f'Finished episode {episode+1} with a total return of {episode_return}')

    return total_return / n_episodes
if __name__ == '__main__':
    # Script entry point: load a stored DQN, run it in the chosen environment
    # and print the mean return.
    args = parser.parse_args()
    print(f'Device type in use: {device}')

    # Initialize environment and config
    env = gym.make(args.env)
    env_config = ENV_CONFIGS[args.env]

    # Everything after the environment exists runs under try/finally so the
    # environment (including the video wrapper, if any) is closed even when
    # loading the model or evaluating it raises.
    try:
        if args.save_video:
            # video_callable always returns True, so every episode is recorded.
            env = gym.wrappers.Monitor(env, './video/', video_callable=lambda episode_id: True, force=True)

        # Load model from provided path.
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints from a trusted source.
        dqn = torch.load(args.path, map_location=device)
        dqn.eval()

        # NOTE(review): the Atari preprocessing and the Pong action wrapper are
        # applied for every --env choice, including the CartPole ones —
        # confirm this is intended.
        env = AtariPreprocessing(
            env,
            screen_size=env_config['screen_size'],
            grayscale_obs=True,
            frame_skip=1,
            noop_max=30)
        env = PongActionWrapper(env)

        mean_return = evaluate_policy(
            dqn,
            env,
            env_config,
            args,
            args.n_eval_episodes,
            # On-screen rendering is switched off while a video is being saved.
            render=args.render and not args.save_video,
            verbose=True)
        print(f'The policy got a mean return of {mean_return} over {args.n_eval_episodes} episodes.')
    finally:
        env.close()