-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_agent.py
209 lines (179 loc) · 7.46 KB
/
run_agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import tensorflow.compat.v1 as tf
# from baselines.ppo2 import ppo2
from fruitbot_ppo import ppo_agent
from fruitbot_ppo.reg_impala_cnn import build_reg_impala_cnn
# Default CNN
#from baselines.common.models import build_impala_cnn
build_impala_cnn = build_reg_impala_cnn
from baselines.common.mpi_util import setup_mpi_gpus
from procgen import ProcgenEnv
from baselines.common.vec_env import (
VecExtractDictObs,
VecMonitor,
VecFrameStack,
VecNormalize
)
from baselines import logger
# from mpi4py import MPI
import argparse
# LOG_DIR = '/tmp/procgen'
# Base directory for run output; main() uses it as the prefix of the
# --run_dir default (LOG_DIR + "default").
LOG_DIR = 'models/'
# Executed at import time: switches tensorflow.compat.v1 to TF1-style behavior.
tf.disable_v2_behavior()
'''
Global variables defaults, values can be changed via parser
All variables of interest which are desired to be tuned must be listed here
'''
# NOTE: learn_helper() looks these names up (and may overwrite them) through
# globals() when --variable_oi is given, so the names themselves are part of
# the command-line interface.
#Hyperparameters
num_envs = 32  # passed to ProcgenEnv(num_envs=...)
learning_rate = 5e-4  # passed to ppo_agent.learn as lr
ent_coef = .01  # passed to ppo_agent.learn as ent_coef
gamma = .999  # passed to ppo_agent.learn as gamma
lam = .95  # passed to ppo_agent.learn as lam
nsteps = 256  # passed to ppo_agent.learn as nsteps; also used to derive save_interval
nminibatches = 8  # passed to ppo_agent.learn as nminibatches
ppo_epochs = 3  # passed to ppo_agent.learn as noptepochs
clip_range = .2  # passed to ppo_agent.learn as cliprange
use_vf_clipping = True  # passed to ppo_agent.learn as clip_vf
#Important variables of interest
rew_scale = 1  # passed to ppo_agent.learn as rew_scale
# NOTE(review): rew_baseline is not read anywhere in the visible part of this file.
rew_baseline = False
# Network builder handed to ppo_agent.learn as `network`.
conv_fn = lambda x: build_impala_cnn(x, depths=[16,64,64], emb_size=256)
# Candidate builders swept by main() when --variable_oi conv_fn is given
# without --values_oi.
conv_fn_vals = [lambda x: build_impala_cnn(x, depths=[16, 32, 64], emb_size=256),
lambda x: build_impala_cnn(x, depths=[32, 32], emb_size=256),
lambda x: build_impala_cnn(x, depths=[16,32,32], emb_size=256)]
# Seed value forwarded to every learn_helper() call.
seeds = 1543
def _parse_bool(text):
    """Convert a command-line token into a bool.

    ``argparse`` with ``type=bool`` calls ``bool(text)``, which is True for
    every non-empty string, so ``--test_mode False`` would enable test mode.
    This converter accepts the usual spellings explicitly instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    lowered = text.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept falsy, matching what bool('') returned before.
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false), got %r' % text)


def main():
    """Parse the command line and launch one or more ``learn_helper`` runs.

    * Neither ``--variable_oi`` nor ``--values_oi``: a single run.
    * ``--variable_oi`` with ``--values_oi``: one run per listed value, each
      logged to ``<run_dir>_<variable>_<value>``.
    * ``--variable_oi conv_fn`` without values: one run per entry of the
      module-level ``conv_fn_vals`` list.

    Raises:
        Exception: if ``--variable_oi`` is not the name of a module-level
            global, or if the variable/values combination is unsupported.
    """
    #Create argument parser
    parser = argparse.ArgumentParser(
        description='Process fruitbot_ppo agent training arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot',
                        help='Provide an environment name available in procgen')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=50,
                        help='Number of levels to run in the environment')
    parser.add_argument('--start_level', type=int, default=0,
                        help='The point in the list of levels available to the '
                             'environment at which to index into, eg. '
                             '--num_levels 50 --start_level 50 makes levels '
                             '50-99 available to this environment')
    parser.add_argument('--timesteps_total', type=int, default=1_000_000,
                        help='The desired number of total timesteps spent training or testing')
    parser.add_argument('--save_interval', type=int, default=0,
                        help='The interval spent in between checkpoints saved, '
                             '0 will save none, and 1 will save checkpoints '
                             'after every model update.')
    parser.add_argument('--load_path', type=str, default=None,
                        help='The relative or absolute path to a model checkpoint '
                             'if an initial load from this checkpoint is desired')
    parser.add_argument('--run_dir', type=str, default=LOG_DIR+"default",
                        help='The relative or absolute path to the directory where results should be logged')
    # type=bool would turn "--test_mode False" into True; parse it explicitly.
    parser.add_argument('--test_mode', type=_parse_bool, default=False,
                        help='True if the model should run as a testing agent, and should not be updated')
    parser.add_argument('--variable_oi', type=str, default=None,
                        help='A global variable name of interest for hyperparameter searching')
    parser.add_argument('--values_oi', type=float, nargs='+', default=None,
                        help='Values of interest for hyperparameter searching')
    parser.add_argument('--num_envs', type=int, default=32,
                        help='The number of environments across which the agent should be run in parallel')
    parser.add_argument('--epopt_timestep', type=int, default=0,
                        help='The number of timesteps to burn-in the model before it begins implementing EPOpt')
    parser.add_argument('--paths', type=int, default=5,
                        help='The number of trajectories to explore in EPOpt')
    args = parser.parse_args()

    variable = args.variable_oi
    candidates = args.values_oi

    # The swept variable is looked up by name among the module globals.
    if variable is not None and variable not in globals():
        raise Exception("Invalid variable of interest - var must be in list:",
                        globals().keys())

    if variable is None:
        # Values without a variable to assign them to make no sense.
        if candidates is not None:
            raise Exception('Invalid variable of interest and values pairing')
        learn_helper(args, seed=seeds)
        return

    if candidates is None:
        # Only conv_fn ships with a built-in list of candidates.
        if variable != 'conv_fn':
            raise Exception('Invalid variable of interest and values pairing')
        candidates = conv_fn_vals

    for valoi in candidates:
        learn_helper(args, variable, valoi,
                     run_dir=args.run_dir+"_"+str(variable)+"_"+str(valoi),
                     seed=seeds, save_once=True)
def learn_helper(args, voi=None, valoi=None, run_dir=None, seed=None, save_once=False):
    """Build the Procgen environment and run one ``ppo_agent.learn`` session.

    Args:
        args: Parsed command-line namespace produced by ``main``. The fields
            read here are timesteps_total, save_interval, epopt_timestep,
            paths, load_path, run_dir, test_mode, num_levels, env_name,
            start_level and distribution_mode.
        voi: Name of a module-level global to override for this run, or None.
        valoi: Value assigned to that global. The override is written through
            ``globals()`` and therefore persists after this call returns.
        run_dir: Directory passed to ``logger.configure``; falls back to
            ``args.run_dir`` when None.
        seed: Seed forwarded to ``ppo_agent.learn``.
        save_once: When True, ``save_interval`` is recomputed as
            ``timesteps_total // (nsteps * num_envs)`` instead of being taken
            from ``args``.
    """
    # NOTE(review): args.num_envs is parsed by main() but not read here; the
    # module-level num_envs (possibly overridden just below) is what is used.
    #num_envs = args.num_envs
    if voi is not None and valoi is not None:
        # --values_oi is parsed as floats; cast back when the global being
        # overridden currently holds an int.
        if isinstance(globals()[voi], int):
            globals()[voi] = int(valoi)
        else:
            globals()[voi] = valoi

    timesteps_per_proc = args.timesteps_total
    save_interval = args.save_interval
    epopt_timestep = args.epopt_timestep
    paths = args.paths
    if save_once:
        save_interval = timesteps_per_proc//(nsteps*num_envs)
        print(save_interval)
    load_path = args.load_path
    if run_dir is None:
        run_dir = args.run_dir
    test_mode = args.test_mode
    # comm = MPI.COMM_WORLD
    # rank = comm.Get_rank()
    num_levels = args.num_levels
    # log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout'] #if log_comm.Get_rank() == 0 else []
    logger.configure(dir=run_dir, format_strs=format_strs)
    print("num_envs" + str(num_envs))

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    try:
        venv = VecExtractDictObs(venv, "rgb")
        venv = VecMonitor(
            venv=venv, filename=None, keep_buf=100,
        )
        venv = VecNormalize(venv=venv, ob=False)

        logger.info("creating tf session")
        setup_mpi_gpus()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True #pylint: disable=E1101
        # The context manager installs the session as default and guarantees
        # the matching exit/close even when training raises. The previous
        # bare sess.__enter__() was never paired with __exit__().
        with tf.Session(config=config) as sess:
            logger.info("training")
            sess.run(tf.global_variables_initializer())
            ppo_agent.learn(
                env=venv,
                network=conv_fn,
                total_timesteps=timesteps_per_proc,
                test_mode=test_mode,
                save_interval=save_interval,
                seed=seed,
                nsteps=nsteps,
                nminibatches=nminibatches,
                lam=lam,
                gamma=gamma,
                noptepochs=ppo_epochs,
                log_interval=1,
                ent_coef=ent_coef,
                clip_vf=use_vf_clipping,
                comm=None,
                lr=learning_rate,
                cliprange=clip_range,
                update_fn=None,
                init_fn=None,
                vf_coef=0.5,
                max_grad_norm=0.5,
                rew_scale=rew_scale,
                epopt_timestep=epopt_timestep,
                paths=paths,
                load_path=load_path
            )
    finally:
        # Release the vectorized environment so sweeps that call this
        # function repeatedly do not accumulate open environments.
        venv.close()

    # Later calls in the same process build their model in the same default
    # graph; mark the root variable scope as reusing so that is permitted.
    tf.get_variable_scope().reuse_variables()
    # tf.reset_default_graph()
# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
    # Restrict TensorFlow's logger to ERROR-level messages before starting.
    tf.logging.set_verbosity(tf.logging.ERROR)
    main()