diff --git a/agent_code/auto_bomber/auto_bomber_config.py b/agent_code/auto_bomber/auto_bomber_config.py
deleted file mode 100644
index b2122cac2..000000000
--- a/agent_code/auto_bomber/auto_bomber_config.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import events as e
-from agent_code.auto_bomber import custom_events as ce
-
-
-MODELS_ROOT = "./models"
-# MODEL_DIR = "./models/63"
-MODEL_DIR = None
-ACTIONS = ['UP', 'RIGHT', 'DOWN', 'LEFT', 'WAIT', 'BOMB']
-EPSILON = 0.35
-DISCOUNT = 0.95
-LEARNING_RATE = 0.01
-POLICY = 'IANN'
-TEMPERATURE = 0.5
-REGION_SIZE = 2
-REGION_TIME_TOLERANCE = 8
-
-game_rewards = {
-    e.CRATE_DESTROYED: 20,
-    # e.BOMB_DROPPED: 20,
-    e.COIN_FOUND: 20,
-    e.COIN_COLLECTED: 50,
-    e.KILLED_OPPONENT: 200,
-    e.INVALID_ACTION: -5,
-    e.KILLED_SELF: -300,
-    e.GOT_KILLED: -200,
-    e.SURVIVED_ROUND: 300,
-    ce.SAME_REGION: -20
-}
diff --git a/agent_code/auto_bomber/callbacks.py b/agent_code/auto_bomber/callbacks.py
index d8dd14bf0..92db83e3f 100644
--- a/agent_code/auto_bomber/callbacks.py
+++ b/agent_code/auto_bomber/callbacks.py
@@ -1,10 +1,10 @@
 import random
 
-import agent_code.auto_bomber.auto_bomber_config as config
 import numpy as np
-from agent_code.auto_bomber.model import LinearAutoBomberModel
 from agent_code.auto_bomber.feature_engineering import state_to_features
+from agent_code.auto_bomber.model import LinearAutoBomberModel
+
 
 
 def setup(self):
     """
@@ -33,14 +33,15 @@ def act(self, game_state: dict) -> str:
 
     :return: The action to take as a string.
     """
-    if self.train and config.POLICY == 'SOFTMAX':
+    hyper_parameters = self.model.hyper_parameters
+    if self.train and hyper_parameters["policy"] == 'SOFTMAX':
         self.model.select_best_action(game_state, self, softmax=True)
-    elif self.train and random.random() < config.EPSILON:
-        if config.POLICY == 'GREEDY':
+    elif self.train and random.random() < hyper_parameters["epsilon"]:
+        if hyper_parameters["policy"] == 'GREEDY':
             self.logger.debug("Choosing action purely at random.")
             # 80%: walk in any direction. 10% wait. 10% bomb.
-            return np.random.choice(config.ACTIONS, p=[.2, .2, .2, .2, .1, .1])
-        elif config.POLICY == 'IANN':
+            return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
+        elif hyper_parameters["policy"] == 'IANN':
             self.model.select_best_action(game_state, self, softmax=True)
     else:
         self.logger.debug("Querying model for action.")
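For reference, a minimal sketch of temperature-scaled softmax (Boltzmann) action selection, which is what the SOFTMAX and IANN branches above delegate to model.select_best_action. The helper name is hypothetical and it works on the raw Q-values, whereas the PR's implementation applies the transform to the argsort of the Q-values; this is an illustration, not the project's exact code.

import numpy as np

def boltzmann_choice(q_values, actions, temperature=0.5, rng=None):
    # Temperature-scaled softmax over Q-values; higher temperature -> closer to uniform.
    rng = rng or np.random.default_rng()
    z = (q_values - q_values.max()) / temperature  # subtract the max for numerical stability
    p = np.exp(z) / np.sum(np.exp(z))
    return actions[rng.choice(len(actions), p=p)]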
diff --git a/agent_code/auto_bomber/default_hyper_parameters.json b/agent_code/auto_bomber/default_hyper_parameters.json
new file mode 100644
index 000000000..46f2719c2
--- /dev/null
+++ b/agent_code/auto_bomber/default_hyper_parameters.json
@@ -0,0 +1,28 @@
+{
+  "actions": [
+    "UP",
+    "RIGHT",
+    "DOWN",
+    "LEFT",
+    "WAIT",
+    "BOMB"
+  ],
+  "epsilon": 0.35,
+  "discount": 0.95,
+  "learning_rate": 0.01,
+  "policy": "SOFTMAX",
+  "temperature": 0.5,
+  "region_size": 2,
+  "region_time_tolerance": 8,
+  "game_rewards": {
+    "CRATE_DESTROYED": 20,
+    "COIN_FOUND": 20,
+    "COIN_COLLECTED": 50,
+    "KILLED_OPPONENT": 200,
+    "INVALID_ACTION": -5,
+    "KILLED_SELF": -300,
+    "GOT_KILLED": -200,
+    "SURVIVED_ROUND": 300,
+    "SAME_REGION": -20
+  }
+}
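As a quick illustration of how these defaults are consumed: the model copies this file into its model directory as hyper_parameters.json and reads it back with json.load, after which every setting is looked up by key. The path below assumes the repository root as the working directory.

import json
from pathlib import Path

# Assumes the current working directory is the repository root.
with Path("agent_code/auto_bomber/default_hyper_parameters.json").open() as file:
    hyper_parameters = json.load(file)

print(hyper_parameters["policy"])                          # "SOFTMAX"
print(hyper_parameters["epsilon"])                         # 0.35
print(hyper_parameters["game_rewards"]["COIN_COLLECTED"])  # 50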
"config.py") + hyper_parameters_path = self.model_dir / "hyper_parameters.json" + if hyper_parameters_path.is_file(): + with hyper_parameters_path.open(mode="rb") as file: + self.hyper_parameters = json.load(file) + if self.train: self.writer = SummaryWriter(logdir=f"./runs/exp{self.model_dir.stem}") def store(self): @@ -51,17 +69,18 @@ def select_best_action(self, game_state: dict, agent_self, softmax=False): if softmax: sort_actions = q_action_values.argsort() - p = np.exp(sort_actions / config.TEMP) / np.sum(np.exp(sort_actions / config.TEMP)) + temp = self.hyper_parameters["temperature"] + p = np.exp(sort_actions / temp) / np.sum(np.exp(sort_actions / temp)) choice = np.random.choice(sort_actions, p=p) else: top_3_actions = q_action_values.argsort()[-3:][::-1] choice = np.random.choice(top_3_actions, p=[0.9, 0.05, 0.05]) - return config.ACTIONS[choice] + return self.hyper_parameters["actions"][choice] def fit_model_with_transition_batch(self, transitions: Transitions, round: int): loss = [] - numpy_transitions = transitions.to_numpy_transitions() - for action_id, action in enumerate(config.ACTIONS): + numpy_transitions = transitions.to_numpy_transitions(self.hyper_parameters) + for action_id, action in enumerate(self.hyper_parameters["actions"]): x_all_t, y_all_t = numpy_transitions.get_features_and_value_estimates(action) if x_all_t.size != 0: @@ -70,7 +89,7 @@ def fit_model_with_transition_batch(self, transitions: Transitions, round: int): loss.append(np.mean(residuals ** 2)) q_grad = np.dot(x_all_t.T, residuals) - weight_updates = config.LEARNING_RATE / y_all_t.shape[0] * q_grad + weight_updates = self.hyper_parameters["learning_rate"] / y_all_t.shape[0] * q_grad self.weights[action_id] += weight_updates mean_loss = np.mean(loss) @@ -83,4 +102,5 @@ def init_if_needed(self, features_x, agent_self): agent_self.logger.info("Model is empty init with random weights.") # Xavier weights initialization - self.weights = np.random.rand(len(config.ACTIONS), len(features_x)) * np.sqrt(1 / len(features_x)) + self.weights = np.random.rand(len(self.hyper_parameters["actions"]), + len(features_x)) * np.sqrt(1 / len(features_x)) diff --git a/agent_code/auto_bomber/model_path.py b/agent_code/auto_bomber/model_path.py new file mode 100644 index 000000000..33e12a109 --- /dev/null +++ b/agent_code/auto_bomber/model_path.py @@ -0,0 +1,2 @@ +MODELS_ROOT = "./models" +MODEL_DIR = None diff --git a/agent_code/auto_bomber/test_transitions.py b/agent_code/auto_bomber/test_transitions.py deleted file mode 100644 index 33e4abf6f..000000000 --- a/agent_code/auto_bomber/test_transitions.py +++ /dev/null @@ -1,65 +0,0 @@ -from unittest import TestCase - -import numpy as np - -import agent_code.auto_bomber.auto_bomber_config as config -from agent_code.auto_bomber.transitions import Transitions - -ARR_SIZE = 10 - - -class TestTransitions(TestCase): - def test_monte_carlo_value_estimation(self): - transitions = Transitions(lambda x: x) - transitions.add_transition(None, config.ACTIONS[0], None, 17.5) - transitions.add_transition(None, config.ACTIONS[1], None, 10.) 
diff --git a/agent_code/auto_bomber/test_transitions.py b/agent_code/auto_bomber/test_transitions.py
deleted file mode 100644
index 33e4abf6f..000000000
--- a/agent_code/auto_bomber/test_transitions.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from unittest import TestCase
-
-import numpy as np
-
-import agent_code.auto_bomber.auto_bomber_config as config
-from agent_code.auto_bomber.transitions import Transitions
-
-ARR_SIZE = 10
-
-
-class TestTransitions(TestCase):
-    def test_monte_carlo_value_estimation(self):
-        transitions = Transitions(lambda x: x)
-        transitions.add_transition(None, config.ACTIONS[0], None, 17.5)
-        transitions.add_transition(None, config.ACTIONS[1], None, 10.)
-        transitions.add_transition(None, config.ACTIONS[2], None, 20)
-        transitions.add_transition(None, config.ACTIONS[3], None, 40)
-        transitions.add_transition(None, config.ACTIONS[1], None, 80)
-
-        numpy_trans = transitions.to_numpy_transitions()
-
-        self.assertEqual(37.5, numpy_trans.monte_carlo_value_estimation(0))
-        self.assertEqual(40, numpy_trans.monte_carlo_value_estimation(1))
-        self.assertEqual(80, numpy_trans.monte_carlo_value_estimation(4))
-
-    def test_get_features_and_value_estimates(self):
-        transitions = Transitions(lambda x: x)
-
-        transitions.add_transition(np.zeros((ARR_SIZE,)), config.ACTIONS[0], np.ones((ARR_SIZE,)), 10)
-        transitions.add_transition(np.ones(ARR_SIZE), config.ACTIONS[1], np.full((ARR_SIZE,), 2), 20.)
-        transitions.add_transition(np.full((ARR_SIZE,), 2), config.ACTIONS[0], np.full((ARR_SIZE,), 3), 40)
-        transitions.add_transition(np.full((ARR_SIZE,), 3), config.ACTIONS[1], np.full((ARR_SIZE,), 4), 80)
-        transitions.add_transition(np.full((ARR_SIZE,), 4), config.ACTIONS[0], np.full((ARR_SIZE,), 5), 160)
-        transitions.add_transition(np.full((ARR_SIZE,), 5), config.ACTIONS[1], np.full((ARR_SIZE,), 6), 320)
-        transitions.add_transition(np.full((ARR_SIZE,), 6), config.ACTIONS[0], np.full((ARR_SIZE,), 7), 640)
-
-        numpy_trans = transitions.to_numpy_transitions()
-        x_0_all, y_0_all = numpy_trans.get_features_and_value_estimates(config.ACTIONS[0])
-        self.assertEqual((4, 10), x_0_all.shape)
-        self.assertEqual((4,), y_0_all.shape)
-
-        exp_x_0 = np.array(
-            [np.zeros((ARR_SIZE,)), np.full((ARR_SIZE,), 2), np.full((ARR_SIZE,), 4), np.full((ARR_SIZE,), 6)])
-        np.testing.assert_array_equal(x_0_all, exp_x_0)
-        exp_y_0 = np.array([70, 200, 480, 640])
-        np.testing.assert_array_equal(y_0_all, exp_y_0)
-
-        x_1_all, y_1_all = numpy_trans.get_features_and_value_estimates(config.ACTIONS[1])
-        self.assertEqual((3, 10), x_1_all.shape)
-        self.assertEqual((3,), y_1_all.shape)
-
-        exp_x_1 = np.array([np.ones((ARR_SIZE,)), np.full((ARR_SIZE,), 3), np.full((ARR_SIZE,), 5)])
-        np.testing.assert_array_equal(x_1_all, exp_x_1)
-        exp_y_1 = np.array([120, 320, 640])
-        np.testing.assert_array_equal(y_1_all, exp_y_1)
-
-    def test_get_features_and_value_estimates_single_action(self):
-        transitions = Transitions(lambda x: x)
-
-        transitions.add_transition(np.zeros((ARR_SIZE,)), config.ACTIONS[0], np.ones((ARR_SIZE,)), 10)
-        numpy_trans = transitions.to_numpy_transitions()
-
-        x_0_all, y_0_all = numpy_trans.get_features_and_value_estimates(config.ACTIONS[0])
-        self.assertEqual((1, 10), x_0_all.shape)
-        self.assertEqual((1,), y_0_all.shape)
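The estimator these deleted tests exercised is the discounted Monte Carlo return from transitions.py, G_t = sum_i gamma^i * r_{t+i}, with the discount now read from the hyper-parameter file. A small standalone sketch using the same reward sequence; note that the deleted expectations (37.5, 40, 80) correspond to a discount of 0.5, not the 0.95 shipped in default_hyper_parameters.json.

import numpy as np

def monte_carlo_return(rewards, discount, start=0):
    # Mirrors NumpyTransitions.monte_carlo_value_estimation on a plain Python list.
    relevant = np.asarray(rewards[start:], dtype=np.float32)
    discounts = discount ** np.arange(len(relevant))
    return float(np.sum(discounts * relevant))

rewards = [17.5, 10.0, 20.0, 40.0, 80.0]
print(monte_carlo_return(rewards, 0.5, 0))  # 37.5, the value the deleted test expected
print(monte_carlo_return(rewards, 0.5, 1))  # 40.0
print(monte_carlo_return(rewards, 0.5, 4))  # 80.0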
diff --git a/agent_code/auto_bomber/train.py b/agent_code/auto_bomber/train.py
index 8b2ef551d..eb314ccb7 100644
--- a/agent_code/auto_bomber/train.py
+++ b/agent_code/auto_bomber/train.py
@@ -1,14 +1,10 @@
-import numpy as np
-from collections import namedtuple, defaultdict
+from queue import Queue
 from typing import List
 
-from agent_code.auto_bomber.feature_engineering import state_to_features
 from agent_code.auto_bomber import custom_events as ce
-
+from agent_code.auto_bomber.feature_engineering import state_to_features
 # This is only an example!
 from agent_code.auto_bomber.transitions import Transitions
-import agent_code.auto_bomber.auto_bomber_config as config
-from queue import Queue
 
 
 def setup_training(self):
@@ -21,7 +17,9 @@ def setup_training(self):
     """
     # Example: Setup an array that will note transition tuples
    self.transitions = Transitions(state_to_features)
-    self.q = Queue(maxsize=config.REGION_TIME_TOLERANCE)
+
+    self.q = Queue(maxsize=self.model.hyper_parameters["region_time_tolerance"])
+
 
 
 def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
@@ -42,15 +40,15 @@ def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_
     :param events: The events that occurred when going from `old_game_state` to `new_game_state`
     """
     self.logger.debug(f'Encountered game event(s) {", ".join(map(repr, events))} in step {new_game_state["step"]}')
-    # state_to_features is defined in callbacks.py
     self.transitions.add_transition(old_game_state, last_action, new_game_state, reward_from_events(self, events))
 
     # Punishment, if agent is still in the same radius after certain time steps
     new_position = new_game_state["self"][3]
+    region_size = self.model.hyper_parameters["region_size"]
     if self.q.full():
         old_position = self.q.get()
-        if (old_position[0] - config.REGION_SIZE <= new_position[0] <= old_position[0] + config.REGION_SIZE) \
-                or (old_position[1] - config.REGION_SIZE <= new_position[1] <= old_position[1] + config.REGION_SIZE):
+        if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
+                or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
             events.append(ce.SAME_REGION)
     self.q.put(new_position)
 
@@ -87,9 +85,11 @@ def reward_from_events(self, events: List[str]) -> int:
     certain behavior.
     """
     # q: how to determine the winner?
+
+    rewards_dict = self.model.hyper_parameters["game_rewards"]
     reward_sum = 0
     for event in events:
-        if event in config.game_rewards:
-            reward_sum += config.game_rewards[event]
+        if event in rewards_dict:
+            reward_sum += rewards_dict[event]
     self.logger.info(f"Awarded {reward_sum} for events {', '.join(events)}")
     return reward_sum
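The SAME_REGION bookkeeping above can be read in isolation: a fixed-size queue holds the last region_time_tolerance positions, and once it is full the position from that many steps ago is compared with the current one along each axis. A standalone sketch under the assumption that positions are (x, y) tuples as in game_state["self"][3]; the helper name is made up.

from queue import Queue

region_size = 2
region_time_tolerance = 8
positions = Queue(maxsize=region_time_tolerance)

def stayed_in_region(positions, new_position):
    # True when the agent is still within the region it occupied region_time_tolerance steps ago.
    penalized = False
    if positions.full():
        old_position = positions.get()
        if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
                or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
            penalized = True  # train.py appends ce.SAME_REGION to the event list at this point
    positions.put(new_position)
    return penalized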
diff --git a/agent_code/auto_bomber/transitions.py b/agent_code/auto_bomber/transitions.py
index 88b31183e..4cbb60422 100644
--- a/agent_code/auto_bomber/transitions.py
+++ b/agent_code/auto_bomber/transitions.py
@@ -1,5 +1,4 @@
 import numpy as np
-import agent_code.auto_bomber.auto_bomber_config as config
 
 
 class Transitions:
@@ -17,8 +16,8 @@ def add_transition(self, old_game_state, action, new_game_state, rewards):
         self.next_states.append(self.feature_extractor(new_game_state))
         self.rewards.append(rewards)
 
-    def to_numpy_transitions(self):
-        return NumpyTransitions(self)
+    def to_numpy_transitions(self, hyper_parameters):
+        return NumpyTransitions(self, hyper_parameters)
 
     def clear(self):
         self.states.clear()
@@ -29,11 +28,12 @@ def clear(self):
 
 class NumpyTransitions:
     # todo add hyperparam for batch size to support TD-n-step and monte-carlo
-    def __init__(self, transitions):
+    def __init__(self, transitions, hyper_parameters):
         self.states = np.asarray(transitions.states, dtype=np.float32)
         self.actions = np.asarray(transitions.actions)
         self.next_states = np.asarray(transitions.next_states, dtype=np.float32)
         self.rewards = np.asarray(transitions.rewards, dtype=np.float32)
+        self.hyper_parameters = hyper_parameters
 
     def get_time_steps_for_action(self, action):
         return np.argwhere(self.actions == action)
@@ -47,6 +47,6 @@ def get_features_and_value_estimates(self, action):
 
     def monte_carlo_value_estimation(self, time_step_start: int):
         relevant_rewards = self.rewards[time_step_start:]
-        discounts = np.fromfunction(lambda i: config.DISCOUNT ** i, shape=(len(relevant_rewards),), dtype=np.float32)
+        discounts = np.fromfunction(lambda i: self.hyper_parameters["discount"] ** i,
+                                    shape=(len(relevant_rewards),), dtype=np.float32)
         return np.sum(discounts * relevant_rewards)
-
diff --git a/train_scripts/coins_only.sh b/train_scripts/coins_only.sh
new file mode 100755
index 000000000..9499363c9
--- /dev/null
+++ b/train_scripts/coins_only.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+python3 main.py play --agents auto_bomber --train 1 --n-rounds 100000 --no-gui
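With auto_bomber_config.py gone, the model directory is resolved at runtime: get_model_dir() prefers a MODEL_DIR environment variable and falls back to model_path.MODEL_DIR. A hedged sketch of pointing a run at an existing model this way, reusing the command from coins_only.sh; the directory ./models/63 is only a placeholder.

import os
import subprocess

# "./models/63" is a placeholder; any existing subdirectory of MODELS_ROOT works.
os.environ["MODEL_DIR"] = "./models/63"

# Same invocation as train_scripts/coins_only.sh, now picking up the model selected above.
subprocess.run(
    ["python3", "main.py", "play", "--agents", "auto_bomber",
     "--train", "1", "--n-rounds", "100000", "--no-gui"],
    check=True,
)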