# trainers.py
import numpy as np
from models import PPOAgent
from utils import *
import pickle


class PPOMultiAgentTrainer:
    """Trains one PPOAgent per agent in a multi-agent environment."""

    # def __init__(self, env, neuralNetSpecs, learningRate, modelPath=None):
    def __init__(self, env, **kwargs):
        self.env = env
        self.numAgents = env.numAgents
        self.actionDim = env.action_space.n
        if 'modelPath' in kwargs:
            # Resume from a previously pickled list of agents.
            modelPath = kwargs.get('modelPath')
            with open(modelPath, 'rb') as fp:
                self.agents = pickle.load(fp)
            self.stateSpaceDim = np.prod(env.observation_space.shape)
        else:
            # Build fresh agents from the supplied network specification.
            assert 'neuralNetSpecs' in kwargs
            neuralNetSpecs = kwargs.get('neuralNetSpecs')
            learningRate = kwargs.get('learningRate', 1e-4)
            if isinstance(neuralNetSpecs[0], ProtoConvNet):
                # Convolutional front end: keep the full observation shape.
                self.stateSpaceDim = env.observation_space.shape
            else:
                # Fully connected front end: flatten the observation.
                self.stateSpaceDim = np.prod(env.observation_space.shape)
            self.isRecurrent = any(isinstance(net, ProtoLSTMNet) for net in neuralNetSpecs)
            self.agents = [PPOAgent(self.stateSpaceDim, self.actionDim, neuralNetSpecs, learningRate)
                           for _ in range(self.numAgents)]
        self.totalSteps = 0
        self.rewardsHistory = []

    def interact(self, observations, rnnState, training=False):
        """Query every agent for an action, step the environment once, and return the result."""
        self.totalSteps += 1
        # Placeholder actions for agents that received no observation this step.
        actions = [{} for _ in range(self.numAgents)]
        for k in range(self.numAgents):
            if observations[k] is not None:
                observation = np.reshape(observations[k], newshape=(-1, self.stateSpaceDim)).astype(np.float32)
                actions[k], rnnState[k] = self.agents[k].act(observation, rnnState[k][0], rnnState[k][1])
        newObservations, rewards, done, _ = self.env.step(actions)
        if not training:
            self.env.render()
        # The episode ends as soon as any agent reports done.
        done = np.logical_or.reduce(done)
        return newObservations, rewards, done

    def test(self, interactionLength):
        """Roll out the current policies (with rendering) for at most interactionLength steps."""
        observations = self.env.reset()
        # Zero-initialised (h, c) LSTM state per agent; 128 is the hidden size expected by PPOAgent.
        rnnState = [(np.zeros((1, 1, 128)), np.zeros((1, 1, 128))) for _ in range(self.numAgents)]
        for t in range(interactionLength):
            newObservations, rewards, done = self.interact(observations, rnnState)
            if done:
                break
            observations = newObservations

    def train(self, maxEpisodes, maxEpisodeLength, logPeriod, savePeriod, savePath):
        cumRewards = np.zeros((self.numAgents,))
        avgEpisodeLength = 0
        for episode in range(1, maxEpisodes + 1):
            observations = self.env.reset()
            rnnState = [(np.zeros((1, 1, 128)), np.zeros((1, 1, 128))) for _ in range(self.numAgents)]
            for t in range(maxEpisodeLength):
                newObservations, rewards, done = self.interact(observations, rnnState, training=True)
                # Store rewards and terminal flags for every agent that acted this step.
                for k in range(self.numAgents):
                    if observations[k] is not None:
                        self.agents[k].memory['rewards'].append(rewards[k])
                        self.agents[k].memory['terminalFlags'].append(done)
                cumRewards += rewards
                if done:
                    break
                observations = newObservations
            avgEpisodeLength += t
            # One PPO update per episode, on the transitions collected above.
            self.learn()
            if episode % logPeriod == 0:
                avgEpisodeLength = int(avgEpisodeLength / logPeriod)
                cumRewards = cumRewards / logPeriod
                print('Episode {} \t avg length: {} \t reward: {}'.format(episode, avgEpisodeLength, cumRewards))
                cumRewards = np.zeros((self.numAgents,))
                avgEpisodeLength = 0
            if episode % savePeriod == 0:
                with open(savePath, 'wb') as fp:
                    pickle.dump(self.agents, fp)
                print("Saved model")

    def learn(self):
        """Run a PPO update for every agent, then clear its rollout buffer."""
        for k in range(self.numAgents):
            self.agents[k].learn(numEpochs=4)
            self.agents[k].clearMemory()
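
# Example usage of PPOMultiAgentTrainer (a minimal sketch, not part of the original
# module). It assumes a multi-agent, gym-style environment that exposes `numAgents`
# and returns per-agent observation/reward lists, as `interact` above expects; the
# keyword values below are illustrative only.
#
#   trainer = PPOMultiAgentTrainer(env, neuralNetSpecs=mySpecs, learningRate=1e-4)
#   trainer.train(maxEpisodes=10000, maxEpisodeLength=500,
#                 logPeriod=20, savePeriod=200, savePath='agents.pkl')
#
#   # Resume or evaluate from the pickled agents:
#   trainer = PPOMultiAgentTrainer(env, modelPath='agents.pkl')
#   trainer.test(interactionLength=500)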


class PPOTrainer:
    """Single-agent PPO trainer for environments with a flat observation vector."""

    def __init__(self, env, neuralNetSpecs, learningRate):
        self.env = env
        self.actionDim = env.action_space.n
        self.stateSpaceDim = np.prod(env.observation_space.shape)
        self.agent = PPOAgent(self.stateSpaceDim, self.actionDim, neuralNetSpecs, learningRate)
        self.totalSteps = 0
        self.rewardsHistory = []

    def train(self, maxEpisodes, maxEpisodeLength, updatePeriod, logPeriod, render=False):
        cumReward = 0
        avgEpisodeLength = 0
        for episode in range(1, maxEpisodes + 1):
            observation = self.env.reset()
            for t in range(maxEpisodeLength):
                self.totalSteps += 1
                action = self.agent.act(observation)
                newObservation, reward, done, _ = self.env.step(action)
                if render:
                    self.env.render()
                self.agent.memory['rewards'].append(reward)
                self.agent.memory['terminalFlags'].append(done)
                # Update the policy every updatePeriod environment steps.
                if self.totalSteps % updatePeriod == 0:
                    self.learn()
                cumReward += reward
                if done:
                    break
                observation = newObservation
            avgEpisodeLength += t
            if episode % logPeriod == 0:
                avgEpisodeLength = int(avgEpisodeLength / logPeriod)
                cumReward = int(cumReward / logPeriod)
                print('Episode {} \t avg length: {} \t reward: {}'.format(episode, avgEpisodeLength, cumReward))
                cumReward = 0
                avgEpisodeLength = 0

    def learn(self):
        """Run a PPO update on the collected transitions, then clear the buffer."""
        self.agent.learn(numEpochs=4)
        self.agent.clearMemory()
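

# Example entry point (a minimal sketch, not part of the original module).
# Assumptions: an older `gym` API whose reset() returns only the observation and whose
# step() returns a 4-tuple, matching the calls in PPOTrainer.train above, and a
# hypothetical `ProtoDenseNet` spec class; the real spec classes and constructor
# arguments live in utils.py (e.g. ProtoConvNet / ProtoLSTMNet) and may differ.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    # Hypothetical network specification; replace with spec objects from utils.py.
    neuralNetSpecs = [ProtoDenseNet(64), ProtoDenseNet(64)]
    trainer = PPOTrainer(env, neuralNetSpecs, learningRate=1e-4)
    trainer.train(maxEpisodes=2000, maxEpisodeLength=500,
                  updatePeriod=2000, logPeriod=20)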