# main.py
import argparse
import os
import random
from envs import MappingEnvironment, LocalISM, RangeISM
from model import CNNActorCritic, MLPActorCritic, ResNetActorCritic, LinearActorCritic
from distributions import Multinomial
import torch
from torch import nn
from torch.autograd import Variable
import numpy as np
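# Trains an actor-critic agent with A2C-style n-step updates on the grid-based MappingEnvironment
# from envs.py, saves periodic checkpoints, and finally evaluates the learned policy with rendering.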
parser = argparse.ArgumentParser()
# General Stuff
parser.add_argument('--experiment', default='run0/', help='folder to put results of experiment in')
parser.add_argument('--cuda', action='store_true', help='enables cuda')
# Neural Network
parser.add_argument('--network', default='mlp', help='network type: mlp | cnn | resnet')
# Environment
parser.add_argument('--N', type=int, default=25, help='size of grid')
parser.add_argument('--map_p', type=float, default=.1, help='probability map location is occupied')
parser.add_argument('--prims', action='store_true', help="use Prim's algorithm for filling in the map")
# Sensor
parser.add_argument('--sensor_type', default='local', help='local | range')
parser.add_argument('--sensor_span', type=int, default=5, help='span of sensor')
parser.add_argument('--sensor_p', type=float, default=.8, help='probability sensor reading is correct')
# MDP
parser.add_argument('--gamma', type=float, default=.98, help='discount rate')
parser.add_argument('--episode_length', type=int, default=200, help='length of mapping environment episodes')
# Training
parser.add_argument('--N_episodes', type=int, default=1000, help='number of episodes to train for')
parser.add_argument('--max_steps', type=int, default=20, help='number of forward steps in A2C')
parser.add_argument('--optimizer', default='adam', help='sgd | adam | rmsprop')
parser.add_argument('--anneal_step_size', type=int, default=100, help='anneal the learning rate every this many episodes')
parser.add_argument('--anneal_gamma', type=float, default=.5, help='annealing multiplicative factor')
parser.add_argument('--lr', type=float, default=1e-3, help='learning rate for the optimizer')
parser.add_argument('--lambda_entropy', type=float, default=.01, help='entropy term coefficient')
parser.add_argument('--max_grad_norm', type=float, default=50., help='max gradient norm of actor_critic')
parser.add_argument('--seed', type=int, default=random.randint(0, 10000), help='random seed')
opt = parser.parse_args()
opt.cuda = opt.cuda and torch.cuda.is_available()
# set random seeds
random.seed(opt.seed)
np.random.seed(opt.seed)
torch.manual_seed(opt.seed)
# make experiment path
os.makedirs(opt.experiment, exist_ok=True)
with open(os.path.join(opt.experiment, 'config.txt'), 'w') as f:
    f.write(str(opt))

# Initialize sensor
if opt.sensor_type == 'local':
    ism_proto = lambda x: LocalISM(x, span=opt.sensor_span, p_correct=opt.sensor_p)
elif opt.sensor_type == 'range':
    ism_proto = lambda x: RangeISM(x)
else:
    raise Exception('sensor type not supported.')
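# ism_proto is a sensor-model factory: MappingEnvironment below calls it to construct
# its ISM (presumably an inverse sensor model) for each new map.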
# Initialize environment
env = MappingEnvironment(ism_proto, N=opt.N, p=opt.map_p, episode_length=opt.episode_length, prims=opt.prims)
# Initialize agent neural network
if opt.network == "DDDQN":
agent = DDDQN_Agent(x_axis=100, y_axis=100, height=4, num_actions = env.num_actions())
elif opt.network == 'cnn':
agent = CNNActorCritic(H_in = env.observation_size(), nc = env.num_channels(), na = env.num_actions())
elif opt.network == 'mlp':
agent = MLPActorCritic(H_in = env.observation_size(), nc = env.num_channels(), na = env.num_actions())
elif opt.network == 'resnet':
agent = ResNetActorCritic(H_in = env.observation_size(), nc = env.num_channels(), na = env.num_actions())
else:
raise Exception('network type not supported')
if opt.cuda:
agent = agent.cuda()
# Initialize optimizer and learning rate scheduler
if opt.network != "DDDQN":
if opt.optimizer == 'rmsprop':
agent_optimizer = torch.optim.RMSprop(agent.parameters(), lr=opt.lr)
elif opt.optimizer == 'adam':
agent_optimizer = torch.optim.Adam(agent.parameters(), lr=opt.lr)
elif opt.optimizer == 'sgd':
agent_optimizer = torch.optim.SGD(agent.parameters(), lr=opt.lr)
else:
raise Exception('optimizer not supported. Try rmsprop/adam/sgd')
# Initialize necessary variables
obs = env.reset()
done = False
t = 0
episodes = 0
ep_rewards = [0]
print ("Main training loop")
while episodes < opt.N_episodes:
    t_start = t
    rewards, observations, actions = [], [], []

    # on-policy rollout for at most max_steps timesteps
    while 1:
        # Perform a_t according to agent
        obs_npy = obs.transpose(2, 0, 1)[None, :]
        obst = torch.Tensor(obs_npy)
        if opt.cuda:
            obst = obst.cuda()
        obsv = Variable(obst)
        agent.eval()
        pa, V = agent(obsv)
        pa = Multinomial(pa)
        a = pa.sample().data[0]

        # Receive reward r_t and new state s_t+1
        obs, reward, done, info = env.step(a.numpy())
        t += 1
        observations.append(obs_npy)
        actions.append(a)
        rewards.append(reward)
        ep_rewards[-1] += reward

        if done:  # terminal s_t: bootstrap from R = 0 and start a new episode
            R = 0
            episodes += 1
            obs = env.reset()
            print("Finished Episode %d:" % episodes, ep_rewards[-1], np.mean(ep_rewards[-50:]))
            ep_rewards.append(0.)

            # anneal the learning rate every anneal_step_size episodes
            if episodes > 0 and episodes % opt.anneal_step_size == 0:
                print("Annealing learning rate: %.7f to %.7f" % (opt.lr, opt.lr * opt.anneal_gamma))
                opt.lr *= opt.anneal_gamma
                if opt.optimizer == 'rmsprop':
                    agent_optimizer = torch.optim.RMSprop(agent.parameters(), lr=opt.lr)
                elif opt.optimizer == 'adam':
                    agent_optimizer = torch.optim.Adam(agent.parameters(), lr=opt.lr)
                elif opt.optimizer == 'sgd':
                    agent_optimizer = torch.optim.SGD(agent.parameters(), lr=opt.lr)
            break

        if t - t_start == opt.max_steps:  # reached num. forward steps: bootstrap from the critic
            R = V.data[0]
            break

    # accumulate discounted rewards for the advantage calculation
    i = len(rewards) - 1
    for r in rewards[::-1]:
        R = rewards[i] + opt.gamma * R
        rewards[i] = R
        i -= 1

    actions_t = torch.Tensor(actions).type(torch.LongTensor)
    if opt.cuda:
        actions_t = actions_t.cuda()
    actions_v = Variable(actions_t)

    rewards_t = torch.Tensor(rewards)
    if opt.cuda:
        rewards_t = rewards_t.cuda()
    rewards_v = Variable(rewards_t)

    observations_npy = np.concatenate(observations)
    observations_t = torch.Tensor(observations_npy)
    if opt.cuda:
        observations_t = observations_t.cuda()
    observations_v = Variable(observations_t)

    agent.train()
    pa, V = agent(observations_v)
    pa_multinomial = Multinomial(pa)
    agent.zero_grad()

    # gradient step: policy gradient with advantage, value regression, and an entropy bonus
    policy_loss = (-pa_multinomial.log_prob(actions_v) * (rewards_v - V.detach())).mean()
    value_loss = (rewards_v - V).pow(2).mean()
    entropy = -torch.sum(pa * torch.log(pa), dim=1).mean()
    (policy_loss + value_loss - opt.lambda_entropy * entropy).backward()
    torch.nn.utils.clip_grad_norm(agent.parameters(), opt.max_grad_norm)
    agent_optimizer.step()

    np.save(os.path.join(opt.experiment, 'results'), ep_rewards)
    if episodes % 1000 == 0:
        torch.save(agent.state_dict(), os.path.join(opt.experiment, 'agentepisode%d.torch' % episodes))
torch.save(agent.state_dict(), os.path.join(opt.experiment, 'agent_episode%d.torch' % episodes))
np.save(os.path.join(opt.experiment, 'results'), ep_rewards)
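# Evaluation: run the trained policy (still sampling from its action distribution) for 1000
# rendered episodes and record each episode's total reward.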
rewards = []
for k in range(1000):
    obs = env.reset()
    env.render(reset=True)
    done = False
    R = 0
    while not done:
        # Perform a_t according to agent
        env.render()
        obs_npy = obs.transpose(2, 0, 1)[None, :]
        obst = torch.Tensor(obs_npy)
        if opt.cuda:
            obst = obst.cuda()
        obsv = Variable(obst)
        agent.eval()
        pa, V = agent(obsv)
        pa = Multinomial(pa)
        a = pa.sample().data[0]

        # Receive reward r_t and new state s_t+1
        obs, reward, done, info = env.step(a)
        R += reward
    print(R)
    rewards.append(R)
np.save(os.path.join(opt.experiment, 'rewards_test'), rewards)
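# Example invocation (a sketch; adjust paths and hyperparameters as needed):
#   python main.py --experiment run0/ --network cnn --sensor_type local --N_episodes 5000 --cuda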