
Commit

Merge branch 'master' into feature_engineering
PrimeF committed Mar 24, 2021
2 parents 7fa011f + 2b1efba commit b7c6657
Showing 9 changed files with 96 additions and 135 deletions.
28 changes: 0 additions & 28 deletions agent_code/auto_bomber/auto_bomber_config.py

This file was deleted.

15 changes: 8 additions & 7 deletions agent_code/auto_bomber/callbacks.py
@@ -1,10 +1,10 @@
import random
import agent_code.auto_bomber.auto_bomber_config as config

import numpy as np

from agent_code.auto_bomber.model import LinearAutoBomberModel
from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber.model import LinearAutoBomberModel


def setup(self):
"""
@@ -33,14 +33,15 @@ def act(self, game_state: dict) -> str:
:return: The action to take as a string.
"""

if self.train and config.POLICY == 'SOFTMAX':
hyper_parameters = self.model.hyper_parameters
if self.train and hyper_parameters["policy"] == 'SOFTMAX':
self.model.select_best_action(game_state, self, softmax=True)
elif self.train and random.random() < config.EPSILON:
if config.POLICY == 'GREEDY':
elif self.train and random.random() < hyper_parameters["epsilon"]:
if hyper_parameters["policy"] == 'GREEDY':
self.logger.debug("Choosing action purely at random.")
# 80%: walk in any direction. 10% wait. 10% bomb.
return np.random.choice(config.ACTIONS, p=[.2, .2, .2, .2, .1, .1])
elif config.POLICY == 'IANN':
return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
elif hyper_parameters["policy"] == 'IANN':
self.model.select_best_action(game_state, self, softmax=True)
else:
self.logger.debug("Querying model for action.")
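The branches above now read the policy, epsilon, and action list from self.model.hyper_parameters instead of the deleted auto_bomber_config module. A minimal, self-contained sketch of the exploration branch under the new defaults (choose_exploration_action and the inline dict are illustrative only, not part of the commit):

import random
import numpy as np

# Hyper-parameters as loaded from hyper_parameters.json (values mirror the
# defaults added in this commit).
hyper_parameters = {
    "actions": ["UP", "RIGHT", "DOWN", "LEFT", "WAIT", "BOMB"],
    "epsilon": 0.35,
    "policy": "GREEDY",
}

def choose_exploration_action(hp):
    """Epsilon-greedy exploration branch from act(), driven by the dict
    instead of the deleted auto_bomber_config constants."""
    if random.random() < hp["epsilon"] and hp["policy"] == "GREEDY":
        # 80%: walk in any direction. 10% wait. 10% bomb.
        return np.random.choice(hp["actions"], p=[.2, .2, .2, .2, .1, .1])
    return None  # fall through to querying the model for the best action

print(choose_exploration_action(hyper_parameters))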
28 changes: 28 additions & 0 deletions agent_code/auto_bomber/default_hyper_parameters.json
@@ -0,0 +1,28 @@
{
"actions": [
"UP",
"RIGHT",
"DOWN",
"LEFT",
"WAIT",
"BOMB"
],
"epsilon": 0.35,
"discount": 0.95,
"learning_rate": 0.01,
"policy": "SOFTMAX",
"temperature": 0.5,
"region_size": 2,
"region_time_tolerance": 8,
"game_rewards": {
"CRATE_DESTROYED": 20,
"COIN_FOUND": 20,
"COIN_COLLECTED": 50,
"KILLED_OPPONENT": 200,
"INVALID_ACTION": -5,
"KILLED_SELF": -300,
"GOT_KILLED": -200,
"SURVIVED_ROUND": 300,
"SAME_REGION": -20
}
}
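This file replaces the deleted auto_bomber_config.py and is copied into each new model directory as hyper_parameters.json, where model.py reads it back. A short sketch of how such a copy could be loaded (the models/0 path is only an example):

import json
from pathlib import Path

# Minimal sketch mirroring the json.load call added to model.py below;
# the path here is hypothetical.
hyper_parameters_path = Path("models/0/hyper_parameters.json")
if hyper_parameters_path.is_file():
    with hyper_parameters_path.open(mode="rb") as file:
        hyper_parameters = json.load(file)
    print(hyper_parameters["game_rewards"]["COIN_COLLECTED"])  # 50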
54 changes: 37 additions & 17 deletions agent_code/auto_bomber/model.py
@@ -1,12 +1,21 @@
import shutil
import json
import os
import pickle
import shutil
from pathlib import Path

import numpy as np
from tensorboardX import SummaryWriter

import agent_code.auto_bomber.model_path as model_path
from agent_code.auto_bomber.transitions import Transitions
import agent_code.auto_bomber.auto_bomber_config as config


def get_model_dir():
try:
return os.environ["MODEL_DIR"]
except KeyError as e:
return model_path.MODEL_DIR


class LinearAutoBomberModel:
@@ -15,28 +24,37 @@ def __init__(self, train, feature_extractor):
self.weights = None
self.feature_extractor = feature_extractor

if config.MODEL_DIR and Path(config.MODEL_DIR).is_dir():
self.model_dir = Path(config.MODEL_DIR)
elif config.MODEL_DIR and not Path(config.MODEL_DIR).is_dir():
model_dir = get_model_dir()
if model_dir and Path(model_dir).is_dir():
self.model_dir = Path(model_dir)
elif model_dir and not Path(model_dir).is_dir():
raise FileNotFoundError("The specified model directory does not exist!\nIf you wish to train a NEW model"
"set parameter to None, otherwise specify a valid model directory.")
elif not self.train and not config.MODEL_DIR:
elif not self.train and not model_dir:
raise ValueError("No model directory has been specified.\n A model directory is required for inference.")
else:
model_index = sorted([int(x.stem) for x in Path(config.MODELS_ROOT).iterdir() if x.is_dir()])[-1]
root_dir = Path(model_path.MODELS_ROOT)
root_dir.mkdir(parents=True, exist_ok=True)
existing_subdirs = sorted([int(x.stem) for x in root_dir.iterdir() if x.is_dir()])

model_index = existing_subdirs[-1] if existing_subdirs else -1
model_index += 1
self.model_dir = Path(config.MODELS_ROOT) / str(model_index)
self.model_dir = Path(model_path.MODELS_ROOT) / str(model_index)
self.model_dir.mkdir()
# Copy configuration file for logging purposes
shutil.copy(Path("default_hyper_parameters.json"), self.model_dir / "hyper_parameters.json")

self.weights_path = self.model_dir / "weights.pt"
if self.weights_path.is_file():
with self.weights_path.open(mode="rb") as file:
self.weights = pickle.load(file)

if self.train:
# Copy configuration file for logging purposes
shutil.copy(Path("./auto_bomber_config.py"), self.model_dir / "config.py")
hyper_parameters_path = self.model_dir / "hyper_parameters.json"
if hyper_parameters_path.is_file():
with hyper_parameters_path.open(mode="rb") as file:
self.hyper_parameters = json.load(file)

if self.train:
self.writer = SummaryWriter(logdir=f"./runs/exp{self.model_dir.stem}")

def store(self):
Expand All @@ -51,17 +69,18 @@ def select_best_action(self, game_state: dict, agent_self, softmax=False):

if softmax:
sort_actions = q_action_values.argsort()
p = np.exp(sort_actions / config.TEMP) / np.sum(np.exp(sort_actions / config.TEMP))
temp = self.hyper_parameters["temperature"]
p = np.exp(sort_actions / temp) / np.sum(np.exp(sort_actions / temp))
choice = np.random.choice(sort_actions, p=p)
else:
top_3_actions = q_action_values.argsort()[-3:][::-1]
choice = np.random.choice(top_3_actions, p=[0.9, 0.05, 0.05])
return config.ACTIONS[choice]
return self.hyper_parameters["actions"][choice]

def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
loss = []
numpy_transitions = transitions.to_numpy_transitions()
for action_id, action in enumerate(config.ACTIONS):
numpy_transitions = transitions.to_numpy_transitions(self.hyper_parameters)
for action_id, action in enumerate(self.hyper_parameters["actions"]):
x_all_t, y_all_t = numpy_transitions.get_features_and_value_estimates(action)

if x_all_t.size != 0:
@@ -70,7 +89,7 @@ def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
loss.append(np.mean(residuals ** 2))
q_grad = np.dot(x_all_t.T, residuals)

weight_updates = config.LEARNING_RATE / y_all_t.shape[0] * q_grad
weight_updates = self.hyper_parameters["learning_rate"] / y_all_t.shape[0] * q_grad
self.weights[action_id] += weight_updates

mean_loss = np.mean(loss)
@@ -83,4 +102,5 @@ def init_if_needed(self, features_x, agent_self):
agent_self.logger.info("Model is empty init with random weights.")

# Xavier weights initialization
self.weights = np.random.rand(len(config.ACTIONS), len(features_x)) * np.sqrt(1 / len(features_x))
self.weights = np.random.rand(len(self.hyper_parameters["actions"]),
len(features_x)) * np.sqrt(1 / len(features_x))
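fit_model_with_transition_batch() now takes its learning rate from the loaded hyper-parameters; the update itself is an ordinary least-squares gradient step on a per-action linear Q-function. A self-contained sketch with stand-in data (the shapes and random values are illustrative, not taken from the commit):

import numpy as np

# One gradient step for a single action, as in fit_model_with_transition_batch():
# x_all_t are feature vectors, y_all_t the Monte Carlo value estimates.
learning_rate = 0.01                               # hyper_parameters["learning_rate"]
rng = np.random.default_rng(0)
x_all_t = rng.random((8, 5)).astype(np.float32)    # 8 transitions, 5 features
y_all_t = rng.random(8).astype(np.float32)         # value estimates for this action
weights = rng.random(5).astype(np.float32)         # weight row for this action

residuals = y_all_t - np.dot(x_all_t, weights)
loss = np.mean(residuals ** 2)                     # logged per round via tensorboardX
q_grad = np.dot(x_all_t.T, residuals)
weights += learning_rate / y_all_t.shape[0] * q_grad
print(loss)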
2 changes: 2 additions & 0 deletions agent_code/auto_bomber/model_path.py
@@ -0,0 +1,2 @@
MODELS_ROOT = "./models"
MODEL_DIR = None
65 changes: 0 additions & 65 deletions agent_code/auto_bomber/test_transitions.py

This file was deleted.

24 changes: 12 additions & 12 deletions agent_code/auto_bomber/train.py
@@ -1,14 +1,10 @@
import numpy as np
from collections import namedtuple, defaultdict
from queue import Queue
from typing import List

from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber import custom_events as ce

from agent_code.auto_bomber.feature_engineering import state_to_features
# This is only an example!
from agent_code.auto_bomber.transitions import Transitions
import agent_code.auto_bomber.auto_bomber_config as config
from queue import Queue


def setup_training(self):
@@ -21,7 +17,9 @@ def setup_training(self):
"""
# Example: Setup an array that will note transition tuples
self.transitions = Transitions(state_to_features)
self.q = Queue(maxsize=config.REGION_TIME_TOLERANCE)

self.q = Queue(maxsize=self.model.hyper_parameters["region_time_tolerance"])


def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
"""
@@ -42,15 +40,15 @@ def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
:param events: The events that occurred when going from `old_game_state` to `new_game_state`
"""
self.logger.debug(f'Encountered game event(s) {", ".join(map(repr, events))} in step {new_game_state["step"]}')

# state_to_features is defined in callbacks.py
self.transitions.add_transition(old_game_state, last_action, new_game_state, reward_from_events(self, events))
# Punishment, if agent is still in the same radius after certain time steps
new_position = new_game_state["self"][3]
region_size = self.model.hyper_parameters["region_size"]
if self.q.full():
old_position = self.q.get()
if (old_position[0] - config.REGION_SIZE <= new_position[0] <= old_position[0] + config.REGION_SIZE) \
or (old_position[1] - config.REGION_SIZE <= new_position[1] <= old_position[1] + config.REGION_SIZE):
if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
events.append(ce.SAME_REGION)
self.q.put(new_position)

@@ -87,9 +85,11 @@ def reward_from_events(self, events: List[str]) -> int:
certain behavior.
"""
# q: how to determine the winner?

rewards_dict = self.model.hyper_parameters["game_rewards"]
reward_sum = 0
for event in events:
if event in config.game_rewards:
reward_sum += config.game_rewards[event]
if event in rewards_dict:
reward_sum += rewards_dict[event]
self.logger.info(f"Awarded {reward_sum} for events {', '.join(events)}")
return reward_sum
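setup_training() and game_events_occurred() now take region_time_tolerance and region_size from the model's hyper-parameters rather than the config module. A self-contained sketch of the queue-based SAME_REGION check (check_same_region and the stationary-agent loop are illustrative only):

from queue import Queue

# Positions are buffered in a fixed-size FIFO queue; once it is full, the
# position from region_time_tolerance steps ago is compared to the current one.
region_size = 2
region_time_tolerance = 8
q = Queue(maxsize=region_time_tolerance)

def check_same_region(new_position):
    penalised = False
    if q.full():
        old_position = q.get()
        if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
                or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
            penalised = True  # train.py appends ce.SAME_REGION here
    q.put(new_position)
    return penalised

for step in range(12):
    # A stationary agent starts getting penalised once the tolerance is exceeded.
    print(step, check_same_region((1, 1)))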
12 changes: 6 additions & 6 deletions agent_code/auto_bomber/transitions.py
@@ -1,5 +1,4 @@
import numpy as np
import agent_code.auto_bomber.auto_bomber_config as config


class Transitions:
@@ -17,8 +16,8 @@ def add_transition(self, old_game_state, action, new_game_state, rewards):
self.next_states.append(self.feature_extractor(new_game_state))
self.rewards.append(rewards)

def to_numpy_transitions(self):
return NumpyTransitions(self)
def to_numpy_transitions(self, hyper_parameters):
return NumpyTransitions(self, hyper_parameters)

def clear(self):
self.states.clear()
@@ -29,11 +28,12 @@ def clear(self):

class NumpyTransitions:
# todo add hyperparam for batch size to support TD-n-step and monte-carlo
def __init__(self, transitions):
def __init__(self, transitions, hyper_parameters):
self.states = np.asarray(transitions.states, dtype=np.float32)
self.actions = np.asarray(transitions.actions)
self.next_states = np.asarray(transitions.next_states, dtype=np.float32)
self.rewards = np.asarray(transitions.rewards, dtype=np.float32)
self.hyper_parameters = hyper_parameters

def get_time_steps_for_action(self, action):
return np.argwhere(self.actions == action)
@@ -47,6 +47,6 @@ def get_features_and_value_estimates(self, action):

def monte_carlo_value_estimation(self, time_step_start: int):
relevant_rewards = self.rewards[time_step_start:]
discounts = np.fromfunction(lambda i: config.DISCOUNT ** i, shape=(len(relevant_rewards),), dtype=np.float32)
discounts = np.fromfunction(lambda i: self.hyper_parameters["discount"] ** i,
shape=(len(relevant_rewards),), dtype=np.float32)
return np.sum(discounts * relevant_rewards)
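monte_carlo_value_estimation() now reads the discount factor from the hyper-parameters passed into NumpyTransitions. A stand-alone sketch of the discounted-return computation with example rewards (the reward values are illustrative):

import numpy as np

discount = 0.95  # hyper_parameters["discount"]
rewards = np.asarray([20.0, 0.0, 50.0, -5.0, 300.0], dtype=np.float32)
time_step_start = 1

# Return from time_step_start: discounted sum of all later rewards.
relevant_rewards = rewards[time_step_start:]
discounts = np.fromfunction(lambda i: discount ** i,
                            shape=(len(relevant_rewards),), dtype=np.float32)
print(np.sum(discounts * relevant_rewards))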

3 changes: 3 additions & 0 deletions train_scripts/coins_only.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

python3 main.py play --agents auto_bomber --train 1 --n-rounds 100000 --no-gui
