add evaluation policy type (cts198859#22)
cts198859 authored Dec 27, 2019
1 parent 5df300f commit ae7442e
Showing 4 changed files with 37 additions and 18 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -12,7 +12,7 @@ Available NN layers:
Fully-connected, LSTM.

Available algorithms:
IQL, IA2C, IA2C with stabilization (called MA2C).
IQL, IA2C, IA2C with stabilization (called MA2C in this paper). For more advanced algorithms, please check [deeprl_network](https://github.com/cts198859/deeprl_network).

Available environments:
* A 6-intersection benchmark traffic network. [Ye, Bao-Lin, et al. "A hierarchical model predictive control approach for signal splits optimization in large-scale urban road networks." IEEE Transactions on Intelligent Transportation Systems 17.8 (2016): 2182-2192.](https://ieeexplore.ieee.org/abstract/document/7406703/)
@@ -43,13 +43,13 @@ tensorboard --logdir=[base_dir]/log

3. To evaluate and compare trained agents, run
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluate-seeds [seeds]
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluation-seeds [seeds]
~~~
Evaluation data will be output to `[base_dir]/eva_data`, and make sure evaluation seeds are different from those used in training.
Evaluation data will be output to `[base_dir]/eva_data`, and make sure evaluation seeds are different from those used in training. Under the default evaluation setting, the inference policy of A2C is stochastic whereas that of Q-learning is greedy (deterministic). To explicitly specify the inference policy type, pass the argument `--evaluation-policy-type [default/stochastic/deterministic]`. Please note that running a deterministic inference policy for A2C may degrade performance, since it violates the "on-policy" assumption of the training.

4. To visualize the agent behavior, run
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agent] --evaluate-seeds [seed] --demo
python3 main.py --base-dir [base_dir] evaluate --agents [agent] --evaluation-seeds [seed] --demo
~~~
It is recommended to have only one agent and one evaluation seed for the demo run. This will launch the SUMO GUI, and `./large_grid/data/view.xml` can be applied to visualize queue length and intersection delay as edge color and thickness. Below are a few example screenshots.

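As an illustration of the new flag documented in step 3 above (this example command is not part of the original README; the bracketed placeholders follow the README's own convention), a greedy-inference evaluation of trained agents could be launched as:
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluation-seeds [seeds] --evaluation-policy-type deterministic
~~~
Omitting `--evaluation-policy-type` (or passing `default`) keeps the behavior described above: stochastic inference for A2C and greedy inference for Q-learning.
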
8 changes: 6 additions & 2 deletions agents/models.py
@@ -344,7 +344,7 @@ def backward(self, summary_writer=None, global_step=None):
            else:
                self.policy_ls[i].backward(self.sess, obs, acts, next_obs, dones, rs, cur_lr)

    def forward(self, obs, mode='act'):
    def forward(self, obs, mode='act', stochastic=False):
        if mode == 'explore':
            eps = self.eps_scheduler.get(1)
        action = []
@@ -354,7 +354,11 @@ def forward(self, obs, mode='act'):
            if (mode == 'explore') and (np.random.random() < eps):
                action.append(np.random.randint(self.n_a_ls[i]))
            else:
                action.append(np.argmax(qs))
                if not stochastic:
                    action.append(np.argmax(qs))
                else:
                    qs = qs / np.sum(qs)
                    action.append(np.random.choice(np.arange(len(qs)), p=qs))
            qs_ls.append(qs)
        return action, qs_ls

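For reference, a minimal standalone sketch of the sampling logic this hunk adds to the multi-agent Q-learning `forward` (the helper name and example Q-values below are made up, not code from the repository): by default the action stays greedy, and with `stochastic=True` the Q-values are normalized into a probability vector and an action is sampled from it. Note the normalization assumes non-negative Q-values; a softmax over the Q-values would be a common alternative.
~~~
import numpy as np

def select_q_action(qs, stochastic=False):
    """Illustrative helper mirroring the commit's stochastic Q-value policy."""
    qs = np.asarray(qs, dtype=np.float64)
    if not stochastic:
        # greedy (deterministic) inference, as before the commit
        return int(np.argmax(qs))
    # normalize Q-values into probabilities (assumes they are non-negative)
    p = qs / np.sum(qs)
    return int(np.random.choice(np.arange(len(qs)), p=p))

# hypothetical Q-values for a 4-action intersection agent
print(select_q_action([0.2, 1.5, 0.8, 0.5], stochastic=True))
~~~
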
15 changes: 9 additions & 6 deletions main.py
@@ -35,7 +35,9 @@ def parse_args():
    sp = subparsers.add_parser('evaluate', help="evaluate and compare agents under base dir")
    sp.add_argument('--agents', type=str, required=False,
                    default='naive', help="agent folder names for evaluation, split by ,")
    sp.add_argument('--evaluate-seeds', type=str, required=False,
    sp.add_argument('--evaluation-policy-type', type=str, required=False, default='default',
                    help="inference policy type in evaluation: default, stochastic, or deterministic")
    sp.add_argument('--evaluation-seeds', type=str, required=False,
                    default=','.join([str(i) for i in range(10000, 100001, 10000)]),
                    help="random seeds for evaluation, split by ,")
    sp.add_argument('--demo', action='store_true', help="shows SUMO gui")
@@ -153,7 +155,7 @@ def train(args):
    model.save(dirs['model'], final_step)


def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
def evaluate_fn(agent_dir, output_dir, seeds, port, demo, policy_type):
    agent = agent_dir.split('/')[-1]
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!' % agent)
@@ -192,7 +194,7 @@ def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env, model, output_dir, demo=demo)
    evaluator = Evaluator(env, model, output_dir, demo=demo, policy_type=policy_type)
    evaluator.run()


@@ -202,8 +204,9 @@ def evaluate(args):
    init_log(dirs['eva_log'])
    agents = args.agents.split(',')
    # enforce the same evaluation seeds across agents
    seeds = args.evaluate_seeds
    logging.info('Evaluation: random seeds: %s' % seeds)
    seeds = args.evaluation_seeds
    policy_type = args.evaluation_policy_type
    logging.info('Evaluation: policy type: %s, random seeds: %s' % (policy_type, seeds))
    if not seeds:
        seeds = []
    else:
@@ -212,7 +215,7 @@
    for i, agent in enumerate(agents):
        agent_dir = base_dir + '/' + agent
        thread = threading.Thread(target=evaluate_fn,
                                  args=(agent_dir, dirs['eva_data'], seeds, i, args.demo))
                                  args=(agent_dir, dirs['eva_data'], seeds, i, args.demo, policy_type))
        thread.start()
        threads.append(thread)
    for thread in threads:
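A side note on the new argument (the snippet below is an illustrative sketch, not code from this commit): since only three values are meaningful, the flag could also be declared with argparse's `choices` so that a typo fails at parse time instead of silently falling through to the default branch downstream.
~~~
import argparse

parser = argparse.ArgumentParser()
sp = parser.add_subparsers().add_parser('evaluate')
sp.add_argument('--evaluation-policy-type', type=str, default='default',
                choices=['default', 'stochastic', 'deterministic'],
                help="inference policy type in evaluation")
args = parser.parse_args(['evaluate', '--evaluation-policy-type', 'stochastic'])
print(args.evaluation_policy_type)  # prints: stochastic
~~~
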
24 changes: 18 additions & 6 deletions utils.py
@@ -192,7 +192,7 @@ def explore(self, prev_ob, prev_done):
            R = 0
        return ob, done, R, rewards

    def perform(self, test_ind, demo=False):
    def perform(self, test_ind, demo=False, policy_type='default'):
        ob = self.env.reset(gui=demo, test_ind=test_ind)
        # note this done is pre-decision to reset LSTM states!
        done = True
@@ -202,17 +202,28 @@ def perform(self, test_ind, demo=False):
            if self.agent == 'greedy':
                action = self.model.forward(ob)
            elif self.agent.endswith('a2c'):
                # policy-based on-policy learning
                policy = self.model.forward(ob, done, 'p')
                if self.agent == 'ma2c':
                    self.env.update_fingerprint(policy)
                if self.agent == 'a2c':
                    action = np.argmax(np.array(policy))
                    if policy_type != 'deterministic':
                        action = np.random.choice(np.arange(len(policy)), p=policy)
                    else:
                        action = np.argmax(np.array(policy))
                else:
                    action = []
                    for pi in policy:
                        action.append(np.argmax(np.array(pi)))
                        if policy_type != 'deterministic':
                            action.append(np.random.choice(np.arange(len(pi)), p=pi))
                        else:
                            action.append(np.argmax(np.array(pi)))
            else:
                action, _ = self.model.forward(ob)
                # value-based off-policy learning
                if policy_type != 'stochastic':
                    action, _ = self.model.forward(ob)
                else:
                    action, _ = self.model.forward(ob, stochastic=True)
            next_ob, reward, done, global_reward = self.env.step(action)
            rewards.append(global_reward)
            if done:
@@ -352,14 +363,15 @@ def run_online(self, coord):


class Evaluator(Tester):
    def __init__(self, env, model, output_path, demo=False):
    def __init__(self, env, model, output_path, demo=False, policy_type='default'):
        self.env = env
        self.model = model
        self.agent = self.env.agent
        self.env.train_mode = False
        self.test_num = self.env.test_num
        self.output_path = output_path
        self.demo = demo
        self.policy_type = policy_type

    def run(self):
        is_record = True
@@ -368,7 +380,7 @@ def run(self):
        self.env.init_data(is_record, record_stats, self.output_path)
        time.sleep(1)
        for test_ind in range(self.test_num):
            reward, _ = self.perform(test_ind, demo=self.demo)
            reward, _ = self.perform(test_ind, demo=self.demo, policy_type=self.policy_type)
            self.env.terminate()
            logging.info('test %i, avg reward %.2f' % (test_ind, reward))
            time.sleep(2)
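To summarize the dispatch that `perform` now implements, here is a hedged standalone sketch (the helper name and example inputs are hypothetical, not repository code): a policy-based (A2C-style) agent samples from its action distribution unless the policy type is `deterministic`, while a value-based (Q-learning) agent stays greedy unless the policy type is `stochastic`.
~~~
import numpy as np

def pick_action(scores, policy_type='default', value_based=False):
    """Illustrative dispatch mirroring the policy_type handling in perform().

    scores: action probabilities (policy-based) or Q-values (value-based).
    """
    scores = np.asarray(scores, dtype=np.float64)
    if value_based:
        # value-based off-policy learning: greedy unless explicitly stochastic
        if policy_type != 'stochastic':
            return int(np.argmax(scores))
        scores = scores / np.sum(scores)  # assumes non-negative Q-values
    else:
        # policy-based on-policy learning: sample unless explicitly deterministic
        if policy_type == 'deterministic':
            return int(np.argmax(scores))
    return int(np.random.choice(np.arange(len(scores)), p=scores))

# examples with a hypothetical 3-action agent
print(pick_action([0.1, 0.7, 0.2]))                               # sampled A2C action
print(pick_action([2.0, 0.5, 1.0], 'default', value_based=True))  # greedy Q-learning action
~~~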
