diff --git a/.gitignore b/.gitignore
index 31f717221..e3f06f830 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,4 +114,4 @@ replays/
 agent_code/auto_bomber/models
 agent_code/auto_bomber/*.pt
 agent_code/auto_bomber/runs
-.DS_Store
\ No newline at end of file
+.DS_Store
diff --git a/agent_code/auto_bomber/auto_bomber_config.py b/agent_code/auto_bomber/auto_bomber_config.py
new file mode 100644
index 000000000..efd547fc0
--- /dev/null
+++ b/agent_code/auto_bomber/auto_bomber_config.py
@@ -0,0 +1,28 @@
+import events as e
+from agent_code.auto_bomber import custom_events as ce
+
+
+MODELS_ROOT = "./models"
+# MODEL_DIR = "./models/41"
+MODEL_DIR = None
+ACTIONS = ['UP', 'RIGHT', 'DOWN', 'LEFT', 'WAIT', 'BOMB']
+EPSILON = 0.25
+DISCOUNT = 0.5
+LEARNING_RATE = 0.1
+POLICY = 'IANN'
+TEMPERATURE = 0.5
+REGION_SIZE = 2
+REGION_TIME_TOLERANCE = 8
+
+game_rewards = {
+    e.CRATE_DESTROYED: 10,
+    e.BOMB_DROPPED: 20,
+    e.COIN_FOUND: 10,
+    e.COIN_COLLECTED: 50,
+    e.KILLED_OPPONENT: 200,
+    e.INVALID_ACTION: -1,
+    e.KILLED_SELF: -300,
+    e.GOT_KILLED: -200,
+    e.SURVIVED_ROUND: 300,
+    ce.SAME_REGION: -20
+}
diff --git a/agent_code/auto_bomber/callbacks.py b/agent_code/auto_bomber/callbacks.py
index 7edf25619..4ebf44f29 100644
--- a/agent_code/auto_bomber/callbacks.py
+++ b/agent_code/auto_bomber/callbacks.py
@@ -5,7 +5,6 @@
 from agent_code.auto_bomber.feature_engineering import state_to_features
 from agent_code.auto_bomber.model import LinearAutoBomberModel
 
-
 def setup(self):
     """
     Setup your code. This is called once when loading each agent.
@@ -34,11 +33,15 @@ def act(self, game_state: dict) -> str:
     """
     hyper_parameters = self.model.hyper_parameters
 
-    # todo right now epsilon-greedy - change to softmax to avoid local maxima
-    if self.train and random.random() < hyper_parameters["epsilon"]:
-        self.logger.debug("Choosing action purely at random.")
-        # 80%: walk in any direction. 10% wait. 10% bomb.
-        return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
+    if self.train and config.POLICY == 'SOFTMAX':
+        return self.model.select_best_action(game_state, self, softmax=True)
+    elif self.train and random.random() < hyper_parameters["epsilon"]:
+        if hyper_parameters["policy"] == 'GREEDY':
+            self.logger.debug("Choosing action purely at random.")
+            # 80%: walk in any direction. 10% wait. 10% bomb.
+            return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
+        elif hyper_parameters["policy"] == 'IANN':
+            return self.model.select_best_action(game_state, self, softmax=True)
     else:
         self.logger.debug("Querying model for action.")
         return self.model.select_best_action(game_state, self)
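Review note on the exploration policy in callbacks.py: act() now dispatches on the configured policy. With POLICY == 'SOFTMAX' the model always samples from a softmax distribution over its action values; otherwise an epsilon roll decides between pure random exploration ('GREEDY', with the 80/10/10 walk/wait/bomb split) and softmax sampling ('IANN'), and in every remaining case the best known action is exploited. The sketch below is illustrative and not part of the patch; choose_training_action, q_values and rng are hypothetical names, the real selection lives in LinearAutoBomberModel.select_best_action, and callbacks.py is assumed to import the new config module as config.

    import numpy as np

    ACTIONS = ['UP', 'RIGHT', 'DOWN', 'LEFT', 'WAIT', 'BOMB']

    def choose_training_action(q_values, policy, epsilon, temperature, rng=None):
        # Hypothetical helper mirroring the branches in act().
        rng = rng or np.random.default_rng()
        if policy == 'SOFTMAX':
            # always sample from a Boltzmann/softmax distribution over the q-values
            p = np.exp((q_values - np.max(q_values)) / temperature)
            return rng.choice(ACTIONS, p=p / p.sum())
        if rng.random() < epsilon:
            if policy == 'GREEDY':
                # pure exploration: 80% walk, 10% wait, 10% bomb
                return rng.choice(ACTIONS, p=[.2, .2, .2, .2, .1, .1])
            if policy == 'IANN':
                p = np.exp((q_values - np.max(q_values)) / temperature)
                return rng.choice(ACTIONS, p=p / p.sum())
        # exploitation: pick the action with the highest value
        return ACTIONS[int(np.argmax(q_values))]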
diff --git a/agent_code/auto_bomber/custom_events.py b/agent_code/auto_bomber/custom_events.py
new file mode 100644
index 000000000..588cdb43d
--- /dev/null
+++ b/agent_code/auto_bomber/custom_events.py
@@ -0,0 +1 @@
+SAME_REGION = 'SAME_REGION'
\ No newline at end of file
diff --git a/agent_code/auto_bomber/feature_engineering.py b/agent_code/auto_bomber/feature_engineering.py
index 9c39f9695..09f1d44ee 100644
--- a/agent_code/auto_bomber/feature_engineering.py
+++ b/agent_code/auto_bomber/feature_engineering.py
@@ -1,6 +1,5 @@
 import numpy as np
-
-from agent_code.auto_bomber.utils import softmax
+from scipy.special import softmax
 
 
 def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.array:
@@ -17,15 +16,22 @@ def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.arra
     without BOMB action available
     :return: np.array
     """
+    #############
+    #   NOTES   #
+    #############
+    # Coin-zone signal is very weak -> use softmax, which keeps 0.0 entries at zero by mapping them to -np.inf
+    # Merging coins into the crate channel does not work: crates must be tracked separately, since (unlike coins) they first have to be blown up
+
     # This is the dict before the game begins and after it ends
     if game_state is None:
         # todo we need another representation for final state here!
-        return np.random.rand(4)
+        return np.random.rand(13)
 
     field_width, field_height = game_state['field'].shape
     assert field_width == field_height, "Field is not rectangular, some assumptions do not hold. Abort!"
 
     agent_position = np.asarray(game_state['self'][3], dtype='int')
+    agent_bomb_action = np.asarray(game_state['self'][2], dtype='int')
     bombs_position = np.asarray([list(bomb[0]) for bomb in game_state['bombs']], dtype='int')
     bombs_countdown = np.asarray([bomb[1] for bomb in game_state['bombs']])
     explosions_position = np.argwhere(game_state['explosion_map'] > 0)
@@ -36,50 +42,68 @@ def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.arra
     opponents_bomb_action = np.where(opponents_bomb_action, 1.0, weight_opponents_no_bomb)
     walls_position = np.argwhere(game_state['field'] == -1)
 
-    # TODO Evaluate normalization/scaling
-    bomb_danger_zones = _compute_zones_heatmap(agent_position, bombs_position, 1.0,
-                                               # lambda v, w: np.divide(1, v * w, out=np.ones_like(v), where=(v*w) != 0),
-                                               lambda v, w: v * w,
-                                               bombs_countdown,
-                                               # lambda v: v / np.max(v)
-                                               lambda v: np.sum(v),
-                                               lambda v: np.divide(v, np.max(v), out=np.zeros_like(v), where=v != 0))
-    # TODO Coins zones signal very weak! -> Used softmax, which keeps 0.0 by using -np.inf
+    # TODO Important: switch distances from Euclidean to a path-finding algorithm
+    #      https://pypi.org/project/pathfinding/
+
+    # TODO Make BOMB_POWER dynamic from settings.py
+    bombs_zones = _compute_zones_heatmap(agent_position, bombs_position, 0.0,
+                                         lambda v, w: np.where(v > 0., v[(3 + w) - v >= 0] ** w[(3 + w) - v >= 0], 0.0),
+                                         bombs_countdown,
+                                         lambda v: np.mean(v) if v.size != 0 else 0.0,
+                                         lambda v: -1 * np.divide(1, v, out=np.zeros_like(v), where=v != 0))
+    # TODO Does not account for how many coins there are in the zone
     coins_zones = _compute_zones_heatmap(agent_position, coins_position, 0.0,
                                          aggregation_func=lambda v: np.mean(v) if v.size != 0 else 0.0,
-                                         normalization_func=lambda v: softmax(np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0)))  # v / np.max(v))
-    crates_zones = _compute_zones_heatmap(agent_position, crates_position, 0.0, aggregation_func=lambda v: np.mean(v),
-                                          normalization_func=lambda v: np.divide(1, v, out=np.zeros_like(v), where=v != 0))  # v / np.max(v))
+                                         normalization_func=lambda v: softmax(
+                                             np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0)) if np.all(
+                                             v != 0.0) else v)
+    crates_zones = _compute_zones_heatmap(agent_position, crates_position, 0.0,
+                                          aggregation_func=lambda v: np.mean(v) if v.size != 0 else 0.0,
+                                          normalization_func=lambda v: softmax(
+                                              np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0)))
     opponents_zones = _compute_zones_heatmap(agent_position, opponents_position, 0.0, lambda v, w: v * w,
                                              opponents_bomb_action, lambda v: np.sum(v),
                                              lambda v: np.divide(v, np.max(v), out=np.zeros_like(v), where=v != 0))
 
-    explosion_field_of_view = _object_in_field_of_view(agent_position, explosions_position, -1., lambda v, w: v / w,
-                                                       field_width)
+    # TODO Evaluate if weighting bombs also here by their countdown
+    # TODO Exclude bombs which are not relevant
+    bombs_field_of_view = _object_in_field_of_view(agent_position, bombs_position, 0.0,
+                                                   lambda v, w: -1 * np.divide(1, v, out=np.zeros_like(v), where=v != 0),
+                                                   None)
+    explosion_field_of_view = _object_in_field_of_view(agent_position, explosions_position, 1.0,
+                                                       lambda v, w: np.where(v == 1.0, 0.0, 1.0), None)
     coins_field_of_view = _object_in_field_of_view(agent_position, coins_position, 0.0,
                                                    lambda v, w: np.divide(1, v, out=np.zeros_like(v), where=v != 0),
                                                    None)
-    crates_field_of_view = _object_in_field_of_view(agent_position, crates_position, -1., lambda v, w: v / w, field_width)
-    # walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, lambda v, w: v / w, field_width)
-    walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, 0.0,
+    crates_field_of_view = _object_in_field_of_view(agent_position, crates_position, 0.0,
+                                                    lambda v, w: np.divide(1, v, out=np.zeros_like(v), where=v != 0),
+                                                    None)
+    walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, 1.0,
                                                    lambda v, w: np.where(v == 1.0, 0.0, 1.0), None)
 
-    # TODO Set auxiliary reward for moving away from a danger zone
-    # TODO Negative reward for staying multiple steps in same position
-    # TODO Negative reward repetition of moves
-
-    # return np.concatenate((bomb_danger_zones, coins_zones, crates_zones, opponents_zones,
-    #                        explosion_field_of_view, coins_field_of_view, crates_field_of_view,
-    #                        walls_field_of_view), axis=None)
-    features = softmax(np.sum(np.concatenate((coins_zones, coins_field_of_view), axis=None).reshape(2, 4), axis=0))
-    # return np.concatenate((coins_zones, coins_field_of_view, walls_field_of_view), axis=None)
-    # return np.concatenate((coins_zones, coins_field_of_view), axis=None)
-    # return np.concatenate((bomb_danger_zones, coins_zones, crates_zones, opponents_zones), axis=None)
-    # return np.concatenate((coins_field_of_view, walls_field_of_view), axis=None)
-
-    features[walls_field_of_view == 0.] = -1.0
+    f_bombs = np.sum(np.vstack((bombs_zones, bombs_field_of_view)), axis=0)
+    if not np.all((f_bombs == 0.0)):
+        f_bombs = np.where(f_bombs == 0.0, np.inf, f_bombs)
+        f_bombs = -1 * softmax(-1 * f_bombs)
+
+    f_coins = np.sum(np.vstack((coins_zones, coins_field_of_view)), axis=0)
+    if not np.all((f_coins == 0.0)):
+        f_coins = np.where(f_coins == 0.0, -np.inf, f_coins)
+        f_coins = softmax(f_coins)
+    f_coins[walls_field_of_view == 0.] = -1.0
+    f_coins[explosion_field_of_view == 0.] = -1.0
+
+    f_crates = np.sum(np.vstack((crates_zones, crates_field_of_view)), axis=0)
+    if not np.all((f_crates == 0.0)):
+        f_crates = np.where(f_crates == 0.0, -np.inf, f_crates)
+        f_crates = softmax(f_crates)
+    f_crates[walls_field_of_view == 0.] = -1.0
+    f_crates[explosion_field_of_view == 0.] = -1.0
+
+    features = np.concatenate((f_coins, f_crates, f_bombs, agent_bomb_action), axis=None)
 
     return features
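Review note on the new feature layout in feature_engineering.py: the feature vector is now 13-dimensional, matching np.random.rand(13) for the missing-state case: a coin group, a crate group and a bomb group of four values each (one per zone around the agent, judging by the reshape(2, 4) that this patch removes), plus the agent's own bomb-availability flag. Each group combines a zone heatmap with a field-of-view signal, softmaxes the non-zero entries and then overwrites blocked directions with -1.0. The helpers _compute_zones_heatmap and _object_in_field_of_view are not shown in this diff, so the sketch below only illustrates the per-group post-processing on toy arrays (finalize_group is an illustrative name, not part of the patch).

    import numpy as np
    from scipy.special import softmax

    def finalize_group(zone_vals, fov_vals, walls_fov, explosions_fov):
        # Combine zone and field-of-view signals, softmax the non-zero entries
        # (zeros are mapped to -inf so they keep probability 0), then mark
        # directions blocked by walls or active explosions with -1.0.
        f = np.sum(np.vstack((zone_vals, fov_vals)), axis=0)
        if not np.all(f == 0.0):
            f = softmax(np.where(f == 0.0, -np.inf, f))
        f[walls_fov == 0.0] = -1.0
        f[explosions_fov == 0.0] = -1.0
        return f

    # Toy example: a coin signal in the second direction, a wall in the third.
    print(finalize_group(np.array([0.0, 0.4, 0.0, 0.0]),
                         np.array([0.0, 0.6, 0.0, 0.0]),
                         walls_fov=np.array([1.0, 1.0, 0.0, 1.0]),
                         explosions_fov=np.ones(4)))
    # prints [ 0.  1. -1.  0.]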
diff --git a/agent_code/auto_bomber/model.py b/agent_code/auto_bomber/model.py
index 32c5f58cb..b0f2d65e9 100644
--- a/agent_code/auto_bomber/model.py
+++ b/agent_code/auto_bomber/model.py
@@ -21,6 +21,8 @@ def __init__(self, train, feature_extractor):
         elif model_path.MODEL_DIR and not Path(model_path.MODEL_DIR).is_dir():
             raise FileNotFoundError("The specified model directory does not exist!\nIf you wish to train a NEW model"
                                     "set parameter to None, otherwise specify a valid model directory.")
+        elif not self.train and not model_path.MODEL_DIR:
+            raise ValueError("No model directory has been specified.\nA model directory is required for inference.")
         else:
             root_dir = Path(model_path.MODELS_ROOT)
             root_dir.mkdir(parents=True, exist_ok=True)
@@ -50,19 +52,20 @@ def store(self):
         with self.weights_path.open(mode="wb") as file:
             pickle.dump(self.weights, file)
 
-    def select_best_action(self, game_state: dict, agent_self):
+    def select_best_action(self, game_state: dict, agent_self, softmax=False):
        features_x = self.feature_extractor(game_state)
         self.init_if_needed(features_x, agent_self)
         q_action_values = np.dot(self.weights, features_x)
 
-        if self.hyper_parameters["top_3_rand"]:
+        if softmax:
+            exp_q = np.exp((q_action_values - np.max(q_action_values)) / self.hyper_parameters["temp"])
+            p = exp_q / np.sum(exp_q)
+            choice = np.random.choice(np.arange(q_action_values.shape[0]), p=p)
+        else:
             top_3_actions = q_action_values.argsort()[-3:][::-1]
-            # lets keep a little bit randomness here
             choice = np.random.choice(top_3_actions, p=[0.9, 0.05, 0.05])
-            return self.hyper_parameters["actions"][choice]
-        else:
-            return np.argmax(q_action_values)
+        return self.hyper_parameters["actions"][choice]
 
     def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
         loss = []
diff --git a/agent_code/auto_bomber/requirements.txt b/agent_code/auto_bomber/requirements.txt
index 59e1163a5..74d41f6fc 100644
--- a/agent_code/auto_bomber/requirements.txt
+++ b/agent_code/auto_bomber/requirements.txt
@@ -3,4 +3,4 @@ pygame==2.0.1
 tqdm==4.58.0
 tensorboardX==2.1
 tensorboard==2.4.1
-
+scipy==1.6.1
\ No newline at end of file
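Review note on the softmax selection in model.py: select_best_action() can now sample the next action from a Boltzmann distribution over the linear Q-values, with the 'temp' hyper-parameter controlling how greedy the choice is (low temperature means nearly greedy, high temperature nearly uniform). A minimal, numerically stable version of that computation in isolation (a sketch, not part of the patch):

    import numpy as np

    def boltzmann_probabilities(q_values, temperature):
        # Subtracting the maximum before exponentiating avoids overflow and
        # leaves the resulting distribution unchanged.
        z = (q_values - np.max(q_values)) / temperature
        e = np.exp(z)
        return e / np.sum(e)

    q = np.array([0.2, 1.5, -0.3, 0.9, 0.0, -1.0])
    p = boltzmann_probabilities(q, temperature=0.5)
    action_index = np.random.choice(len(q), p=p)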
diff --git a/agent_code/auto_bomber/train.py b/agent_code/auto_bomber/train.py
index 7a5420b59..0b53b5375 100644
--- a/agent_code/auto_bomber/train.py
+++ b/agent_code/auto_bomber/train.py
@@ -2,11 +2,13 @@
 from collections import namedtuple, defaultdict
 from typing import List
 
-import events as e
 from agent_code.auto_bomber.feature_engineering import state_to_features
+from agent_code.auto_bomber import custom_events as ce
 
 # This is only an example!
 from agent_code.auto_bomber.transitions import Transitions
+import agent_code.auto_bomber.auto_bomber_config as config
+from queue import Queue
 
 
 def setup_training(self):
@@ -19,7 +21,7 @@ def setup_training(self):
     """
     # Example: Setup an array that will note transition tuples
     self.transitions = Transitions(state_to_features)
-
+    self.q = Queue(maxsize=config.REGION_TIME_TOLERANCE)
 
 
 def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
@@ -43,6 +45,14 @@ def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_
     # state_to_features is defined in callbacks.py
     self.transitions.add_transition(old_game_state, last_action, new_game_state, reward_from_events(self, events))
 
+    # Punish the agent if it is still within the same region after a certain number of time steps
+    new_position = new_game_state["self"][3]
+    if self.q.full():
+        old_position = self.q.get()
+        if (old_position[0] - config.REGION_SIZE <= new_position[0] <= old_position[0] + config.REGION_SIZE) \
+                or (old_position[1] - config.REGION_SIZE <= new_position[1] <= old_position[1] + config.REGION_SIZE):
+            events.append(ce.SAME_REGION)
+    self.q.put(new_position)
 
 
 def end_of_round(self, last_game_state: dict, last_action: str, events: List[str]):
@@ -76,31 +86,11 @@ def reward_from_events(self, events: List[str]) -> int:
     Here you can modify the rewards your agent get so as to en/discourage
     certain behavior.
     """
-    # todo reward definition (right now only first sketch):
     # q: how to determine the winner?
-    game_rewards = {
-        e.COIN_COLLECTED: 100,
-        e.KILLED_OPPONENT: 50,
-        e.INVALID_ACTION: -100,
-        e.KILLED_SELF: -300,
-        e.GOT_KILLED: -50,
-        e.WAITED: -10,
-        e.SURVIVED_ROUND: 5
-    }
-
-    # game_rewards = {
-    #     e.COIN_COLLECTED: 20,
-    #     e.KILLED_OPPONENT: 40,
-    #     e.INVALID_ACTION: -10,
-    #     e.KILLED_SELF: -50,
-    #     e.GOT_KILLED: -30,
-    #     e.WAITED: -5,
-    #     e.SURVIVED_ROUND: -1
-    # }
+
     reward_sum = 0
     for event in events:
-        if event in game_rewards:
-            reward_sum += game_rewards[event]
+        if event in config.game_rewards:
+            reward_sum += config.game_rewards[event]
     self.logger.info(f"Awarded {reward_sum} for events {', '.join(events)}")
     return reward_sum
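Review note on the SAME_REGION penalty in train.py: game_events_occurred() keeps the last REGION_TIME_TOLERANCE positions in a FIFO queue and appends the custom SAME_REGION event (rewarded with -20 in auto_bomber_config.py) when the agent is still close to where it was REGION_TIME_TOLERANCE steps earlier. The patch triggers the penalty when either coordinate is within +/- REGION_SIZE of the old position; the sketch below, which is not part of the patch, uses a collections.deque instead of queue.Queue and requires both coordinates to be close (a stricter box test, stated here as an assumption about the intended behaviour).

    from collections import deque

    REGION_SIZE = 2
    REGION_TIME_TOLERANCE = 8

    class RegionTracker:
        # Illustrative helper: compares the current position with the one
        # recorded REGION_TIME_TOLERANCE steps earlier.

        def __init__(self):
            self.history = deque(maxlen=REGION_TIME_TOLERANCE)

        def lingering(self, position):
            is_lingering = False
            if len(self.history) == self.history.maxlen:
                old_x, old_y = self.history[0]
                is_lingering = (abs(position[0] - old_x) <= REGION_SIZE
                                and abs(position[1] - old_y) <= REGION_SIZE)
            self.history.append(position)  # maxlen evicts the oldest entry
            return is_lingering

    # Usage inside game_events_occurred:
    #     if tracker.lingering(new_game_state["self"][3]):
    #         events.append(ce.SAME_REGION)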
- """ - - # make X at least 2d - y = np.atleast_2d(X) - - # find axis - if axis is None: - axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) - - # multiply y against the theta parameter, - y = y * float(theta) - - # subtract the max for numerical stability - y = y - np.expand_dims(np.max(y, axis=axis), axis) - - # exponentiate y - y = np.exp(y) - - # take the sum along the specified axis - ax_sum = np.expand_dims(np.sum(y, axis=axis), axis) - - # finally: divide elementwise - p = y / ax_sum - - # flatten if X was 1D - if len(X.shape) == 1: - p = p.flatten() - - return p diff --git a/settings.py b/settings.py index 8ff09938b..00a1ac950 100644 --- a/settings.py +++ b/settings.py @@ -5,7 +5,7 @@ # Game properties COLS = 17 ROWS = 17 -CRATE_DENSITY = 0.0 # 0.75 +CRATE_DENSITY = 0.0 # 0.25 # 0.75 MAX_AGENTS = 4 # Round properties