add evaluation policy type (cts198859#22)
cts198859 authored Dec 27, 2019
1 parent 5df300f commit ae7442e
Showing 4 changed files with 37 additions and 18 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -12,7 +12,7 @@ Available NN layers:
Fully-connected, LSTM.

Available algorithms:
IQL, IA2C, IA2C with stabilization (called MA2C).
IQL, IA2C, IA2C with stabilization (called MA2C in this paper). For more advanced algorithms, please check [deeprl_network](https://github.com/cts198859/deeprl_network).

Available environments:
* A 6-intersection benchmark traffic network. [Ye, Bao-Lin, et al. "A hierarchical model predictive control approach for signal splits optimization in large-scale urban road networks." IEEE Transactions on Intelligent Transportation Systems 17.8 (2016): 2182-2192.](https://ieeexplore.ieee.org/abstract/document/7406703/)
@@ -43,13 +43,13 @@ tensorboard --logdir=[base_dir]/log

3. To evaluate and compare trained agents, run
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluate-seeds [seeds]
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluation-seeds [seeds]
~~~
Evaluation data will be output to `[base_dir]/eva_data`, and make sure evaluation seeds are different from those used in training.
Evaluation data will be output to `[base_dir]/eva_data`, and make sure evaluation seeds are different from those used in training. Under the default evaluation setting, the inference policy of A2C is stochastic whereas that of Q-learning is greedy (deterministic). To explicitly specify the inference policy type, pass the argument `--evaluation-policy-type [default/stochastic/deterministic]`. Please note that running a deterministic inference policy for A2C may degrade performance, since it violates the "on-policy" assumption of the training.

4. To visualize the agent behavior, run
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agent] --evaluate-seeds [seed] --demo
python3 main.py --base-dir [base_dir] evaluate --agents [agent] --evaluation-seeds [seed] --demo
~~~
It is recommended to have only one agent and one evaluation seed for the demo run. This will launch the SUMO GUI, and `./large_grid/data/view.xml` can be applied to visualize queue length and intersection delay as edge color and thickness. Below are a few example screenshots.

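As an illustration of the new flag documented in step 3 above (this example command is not part of the original README; the bracketed placeholders follow the README's own convention), a greedy-inference evaluation of trained agents could be launched as:
~~~
python3 main.py --base-dir [base_dir] evaluate --agents [agents] --evaluation-seeds [seeds] --evaluation-policy-type deterministic
~~~
Omitting `--evaluation-policy-type` (or passing `default`) keeps the behavior described above: stochastic inference for A2C and greedy inference for Q-learning.
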
8 changes: 6 additions & 2 deletions agents/models.py
@@ -344,7 +344,7 @@ def backward(self, summary_writer=None, global_step=None):
            else:
                self.policy_ls[i].backward(self.sess, obs, acts, next_obs, dones, rs, cur_lr)

    def forward(self, obs, mode='act'):
    def forward(self, obs, mode='act', stochastic=False):
        if mode == 'explore':
            eps = self.eps_scheduler.get(1)
        action = []
@@ -354,7 +354,11 @@ def forward(self, obs, mode='act'):
            if (mode == 'explore') and (np.random.random() < eps):
                action.append(np.random.randint(self.n_a_ls[i]))
            else:
                action.append(np.argmax(qs))
                if not stochastic:
                    action.append(np.argmax(qs))
                else:
                    qs = qs / np.sum(qs)
                    action.append(np.random.choice(np.arange(len(qs)), p=qs))
            qs_ls.append(qs)
        return action, qs_ls

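For reference, a minimal standalone sketch of the sampling logic this hunk adds to the multi-agent Q-learning `forward` (the helper name and example Q-values below are made up, not code from the repository): by default the action stays greedy, and with `stochastic=True` the Q-values are normalized into a probability vector and an action is sampled from it. Note the normalization assumes non-negative Q-values; a softmax over the Q-values would be a common alternative.
~~~
import numpy as np

def select_q_action(qs, stochastic=False):
    """Illustrative helper mirroring the commit's stochastic Q-value policy."""
    qs = np.asarray(qs, dtype=np.float64)
    if not stochastic:
        # greedy (deterministic) inference, as before the commit
        return int(np.argmax(qs))
    # normalize Q-values into probabilities (assumes they are non-negative)
    p = qs / np.sum(qs)
    return int(np.random.choice(np.arange(len(qs)), p=p))

# hypothetical Q-values for a 4-action intersection agent
print(select_q_action([0.2, 1.5, 0.8, 0.5], stochastic=True))
~~~
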
15 changes: 9 additions & 6 deletions main.py
@@ -35,7 +35,9 @@ def parse_args():
    sp = subparsers.add_parser('evaluate', help="evaluate and compare agents under base dir")
    sp.add_argument('--agents', type=str, required=False,
                    default='naive', help="agent folder names for evaluation, split by ,")
    sp.add_argument('--evaluate-seeds', type=str, required=False,
    sp.add_argument('--evaluation-policy-type', type=str, required=False, default='default',
                    help="inference policy type in evaluation: default, stochastic, or deterministic")
    sp.add_argument('--evaluation-seeds', type=str, required=False,
                    default=','.join([str(i) for i in range(10000, 100001, 10000)]),
                    help="random seeds for evaluation, split by ,")
    sp.add_argument('--demo', action='store_true', help="shows SUMO gui")
@@ -153,7 +155,7 @@ def train(args):
    model.save(dirs['model'], final_step)


def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
def evaluate_fn(agent_dir, output_dir, seeds, port, demo, policy_type):
    agent = agent_dir.split('/')[-1]
    if not check_dir(agent_dir):
        logging.error('Evaluation: %s does not exist!' % agent)
@@ -192,7 +194,7 @@ def evaluate_fn(agent_dir, output_dir, seeds, port, demo):
        model = greedy_policy
    env.agent = agent
    # collect evaluation data
    evaluator = Evaluator(env, model, output_dir, demo=demo)
    evaluator = Evaluator(env, model, output_dir, demo=demo, policy_type=policy_type)
    evaluator.run()


@@ -202,8 +204,9 @@ def evaluate(args):
    init_log(dirs['eva_log'])
    agents = args.agents.split(',')
    # enforce the same evaluation seeds across agents
    seeds = args.evaluate_seeds
    logging.info('Evaluation: random seeds: %s' % seeds)
    seeds = args.evaluation_seeds
    policy_type = args.evaluation_policy_type
    logging.info('Evaluation: policy type: %s, random seeds: %s' % (policy_type, seeds))
    if not seeds:
        seeds = []
    else:
@@ -212,7 +215,7 @@
    for i, agent in enumerate(agents):
        agent_dir = base_dir + '/' + agent
        thread = threading.Thread(target=evaluate_fn,
                                  args=(agent_dir, dirs['eva_data'], seeds, i, args.demo))
                                  args=(agent_dir, dirs['eva_data'], seeds, i, args.demo, policy_type))
        thread.start()
        threads.append(thread)
    for thread in threads:
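A side note on the new argument (the snippet below is an illustrative sketch, not code from this commit): since only three values are meaningful, the flag could also be declared with argparse's `choices` so that a typo fails at parse time instead of silently falling through to the default branch downstream.
~~~
import argparse

parser = argparse.ArgumentParser()
sp = parser.add_subparsers().add_parser('evaluate')
sp.add_argument('--evaluation-policy-type', type=str, default='default',
                choices=['default', 'stochastic', 'deterministic'],
                help="inference policy type in evaluation")
args = parser.parse_args(['evaluate', '--evaluation-policy-type', 'stochastic'])
print(args.evaluation_policy_type)  # prints: stochastic
~~~
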
24 changes: 18 additions & 6 deletions utils.py
@@ -192,7 +192,7 @@ def explore(self, prev_ob, prev_done):
            R = 0
        return ob, done, R, rewards

    def perform(self, test_ind, demo=False):
    def perform(self, test_ind, demo=False, policy_type='default'):
        ob = self.env.reset(gui=demo, test_ind=test_ind)
        # note this done is pre-decision to reset LSTM states!
        done = True
@@ -202,17 +202,28 @@ def perform(self, test_ind, demo=False):
            if self.agent == 'greedy':
                action = self.model.forward(ob)
            elif self.agent.endswith('a2c'):
                # policy-based on-policy learning
                policy = self.model.forward(ob, done, 'p')
                if self.agent == 'ma2c':
                    self.env.update_fingerprint(policy)
                if self.agent == 'a2c':
                    action = np.argmax(np.array(policy))
                    if policy_type != 'deterministic':
                        action = np.random.choice(np.arange(len(policy)), p=policy)
                    else:
                        action = np.argmax(np.array(policy))
                else:
                    action = []
                    for pi in policy:
                        action.append(np.argmax(np.array(pi)))
                        if policy_type != 'deterministic':
                            action.append(np.random.choice(np.arange(len(pi)), p=pi))
                        else:
                            action.append(np.argmax(np.array(pi)))
            else:
                action, _ = self.model.forward(ob)
                # value-based off-policy learning
                if policy_type != 'stochastic':
                    action, _ = self.model.forward(ob)
                else:
                    action, _ = self.model.forward(ob, stochastic=True)
            next_ob, reward, done, global_reward = self.env.step(action)
            rewards.append(global_reward)
            if done:
@@ -352,14 +363,15 @@ def run_online(self, coord):


class Evaluator(Tester):
    def __init__(self, env, model, output_path, demo=False):
    def __init__(self, env, model, output_path, demo=False, policy_type='default'):
        self.env = env
        self.model = model
        self.agent = self.env.agent
        self.env.train_mode = False
        self.test_num = self.env.test_num
        self.output_path = output_path
        self.demo = demo
        self.policy_type = policy_type

    def run(self):
        is_record = True
@@ -368,7 +380,7 @@ def run(self):
        self.env.init_data(is_record, record_stats, self.output_path)
        time.sleep(1)
        for test_ind in range(self.test_num):
            reward, _ = self.perform(test_ind, demo=self.demo)
            reward, _ = self.perform(test_ind, demo=self.demo, policy_type=self.policy_type)
            self.env.terminate()
            logging.info('test %i, avg reward %.2f' % (test_ind, reward))
            time.sleep(2)
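To summarize the dispatch that `perform` now implements, here is a hedged standalone sketch (the helper name and example inputs are hypothetical, not repository code): a policy-based (A2C-style) agent samples from its action distribution unless the policy type is `deterministic`, while a value-based (Q-learning) agent stays greedy unless the policy type is `stochastic`.
~~~
import numpy as np

def pick_action(scores, policy_type='default', value_based=False):
    """Illustrative dispatch mirroring the policy_type handling in perform().

    scores: action probabilities (policy-based) or Q-values (value-based).
    """
    scores = np.asarray(scores, dtype=np.float64)
    if value_based:
        # value-based off-policy learning: greedy unless explicitly stochastic
        if policy_type != 'stochastic':
            return int(np.argmax(scores))
        scores = scores / np.sum(scores)  # assumes non-negative Q-values
    else:
        # policy-based on-policy learning: sample unless explicitly deterministic
        if policy_type == 'deterministic':
            return int(np.argmax(scores))
    return int(np.random.choice(np.arange(len(scores)), p=scores))

# examples with a hypothetical 3-action agent
print(pick_action([0.1, 0.7, 0.2]))                               # sampled A2C action
print(pick_action([2.0, 0.5, 1.0], 'default', value_based=True))  # greedy Q-learning action
~~~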
