
Commit

Merge remote-tracking branch 'origin/master' into fb_misc
# Conflicts:
#	agent_code/auto_bomber/auto_bomber_config.py
#	agent_code/auto_bomber/callbacks.py
#	agent_code/auto_bomber/model.py
tkrieger committed Mar 24, 2021
2 parents 2cf546c + 4ef98a2 commit fa98e30
Showing 10 changed files with 122 additions and 119 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -114,4 +114,4 @@ replays/
agent_code/auto_bomber/models
agent_code/auto_bomber/*.pt
agent_code/auto_bomber/runs
.DS_Store
.DS_Store
28 changes: 28 additions & 0 deletions agent_code/auto_bomber/auto_bomber_config.py
@@ -0,0 +1,28 @@
import events as e
from agent_code.auto_bomber import custom_events as ce


MODELS_ROOT = "./models"
# MODEL_DIR = "./models/41"
MODEL_DIR = None
ACTIONS = ['UP', 'RIGHT', 'DOWN', 'LEFT', 'WAIT', 'BOMB']
EPSILON = 0.25
DISCOUNT = 0.5
LEARNING_RATE = 0.1
POLICY = 'IANN'
TEMPERATURE = 0.5
REGION_SIZE = 2
REGION_TIME_TOLERANCE = 8

game_rewards = {
e.CRATE_DESTROYED: 10,
e.BOMB_DROPPED: 20,
e.COIN_FOUND: 10,
e.COIN_COLLECTED: 50,
e.KILLED_OPPONENT: 200,
e.INVALID_ACTION: -1,
e.KILLED_SELF: -300,
e.GOT_KILLED: -200,
e.SURVIVED_ROUND: 300,
ce.SAME_REGION: -20
}
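TEMPERATURE drives the softmax-based policy selection ('SOFTMAX'/'IANN') used during training: Q-values are divided by the temperature before being turned into action probabilities, so a low temperature concentrates probability on the best action while a high one approaches uniform random play. A rough, self-contained illustration with made-up Q-values (not code from the repository):

import numpy as np

q = np.array([0.8, 0.5, -0.2, 0.1, 0.0, -1.0])  # made-up Q-values for the six actions
for temp in (0.5, 5.0):
    p = np.exp(q / temp) / np.sum(np.exp(q / temp))
    print(temp, np.round(p, 2))
# temp=0.5 puts most probability on the best action; temp=5.0 is close to uniform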
15 changes: 9 additions & 6 deletions agent_code/auto_bomber/callbacks.py
@@ -5,7 +5,6 @@
from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber.model import LinearAutoBomberModel


def setup(self):
"""
Setup your code. This is called once when loading each agent.
@@ -34,11 +33,15 @@ def act(self, game_state: dict) -> str:
"""

hyper_parameters = self.model.hyper_parameters
# todo right now epsilon-greedy - change to softmax to avoid local maxima
if self.train and random.random() < hyper_parameters["epsilon"]:
self.logger.debug("Choosing action purely at random.")
# 80%: walk in any direction. 10% wait. 10% bomb.
return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
    if self.train and config.POLICY == 'SOFTMAX':
        return self.model.select_best_action(game_state, self, softmax=True)
elif self.train and random.random() < hyper_parameters["epsilon"]:
        if hyper_parameters["policy"] == 'GREEDY':
self.logger.debug("Choosing action purely at random.")
# 80%: walk in any direction. 10% wait. 10% bomb.
return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
        elif hyper_parameters["policy"] == 'IANN':
            return self.model.select_best_action(game_state, self, softmax=True)
else:
self.logger.debug("Querying model for action.")
return self.model.select_best_action(game_state, self)
1 change: 1 addition & 0 deletions agent_code/auto_bomber/custom_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SAME_REGION = 'SAME_REGION'
92 changes: 58 additions & 34 deletions agent_code/auto_bomber/feature_engineering.py
@@ -1,6 +1,5 @@
import numpy as np

from agent_code.auto_bomber.utils import softmax
from scipy.special import softmax


def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.array:
@@ -17,15 +16,22 @@ def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.array
without BOMB action available
:return: np.array
"""
#############
# NOTES #
#############
    # Coin zone signal is very weak! -> Used softmax, which keeps 0.0 entries at 0.0 by mapping them to -np.inf
    # Adding coins to the crates channel --> not good: we need to know where the crates are, and crates are
    # distinct from coins since they have to be blown up first

# This is the dict before the game begins and after it ends
if game_state is None:
# todo we need another representation for final state here!
return np.random.rand(4)
return np.random.rand(13)

field_width, field_height = game_state['field'].shape
    assert field_width == field_height, "Field is not square, some assumptions do not hold. Abort!"

agent_position = np.asarray(game_state['self'][3], dtype='int')
agent_bomb_action = np.asarray(game_state['self'][2], dtype='int')
bombs_position = np.asarray([list(bomb[0]) for bomb in game_state['bombs']], dtype='int')
bombs_countdown = np.asarray([bomb[1] for bomb in game_state['bombs']])
explosions_position = np.argwhere(game_state['explosion_map'] > 0)
@@ -36,50 +42,68 @@ def state_to_features(game_state: dict, weight_opponents_no_bomb=0.0) -> np.array
opponents_bomb_action = np.where(opponents_bomb_action, 1.0, weight_opponents_no_bomb)
walls_position = np.argwhere(game_state['field'] == -1)

# TODO Evaluate normalization/scaling
bomb_danger_zones = _compute_zones_heatmap(agent_position, bombs_position, 1.0,
# lambda v, w: np.divide(1, v * w, out=np.ones_like(v), where=(v*w) != 0),
lambda v, w: v * w,
bombs_countdown,
# lambda v: v / np.max(v)
lambda v: np.sum(v),
lambda v: np.divide(v, np.max(v), out=np.zeros_like(v), where=v != 0))
# TODO Coins zones signal very weak! -> Used softmax, which keeps 0.0 by using -np.inf
# TODO HUUUUUUUUUGE!!!!!!! --> Switch distances from euclidean to a path finding algorithm
# https://pypi.org/project/pathfinding/

# TODO Make BOMB_POWER dynamic from settings.py
bombs_zones = _compute_zones_heatmap(agent_position, bombs_position, 0.0,
lambda v, w: np.where(v > 0., v[(3 + w) - v >= 0] ** w[(3 + w) - v >= 0], 0.0),
bombs_countdown,
lambda v: np.mean(v) if v.size != 0 else 0.0,
lambda v: -1 * np.divide(1, v, out=np.zeros_like(v), where=v != 0))

# TODO Does not account for how many coins there are in the zone
coins_zones = _compute_zones_heatmap(agent_position, coins_position, 0.0,
aggregation_func=lambda v: np.mean(v) if v.size != 0 else 0.0,
normalization_func=lambda v: softmax(np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0))) # v / np.max(v))
crates_zones = _compute_zones_heatmap(agent_position, crates_position, 0.0, aggregation_func=lambda v: np.mean(v),
normalization_func=lambda v: np.divide(1, v, out=np.zeros_like(v), where=v != 0)) # v / np.max(v))
normalization_func=lambda v: softmax(
np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0)) if np.all(
v != 0.0) else v)
crates_zones = _compute_zones_heatmap(agent_position, crates_position, 0.0,
aggregation_func=lambda v: np.mean(v) if v.size != 0 else 0.0,
normalization_func=lambda v: softmax(
np.divide(1, v, out=np.full_like(v, -np.inf), where=v != 0)))
opponents_zones = _compute_zones_heatmap(agent_position, opponents_position, 0.0, lambda v, w: v * w,
opponents_bomb_action,
lambda v: np.sum(v),
lambda v: np.divide(v, np.max(v), out=np.zeros_like(v), where=v != 0))

explosion_field_of_view = _object_in_field_of_view(agent_position, explosions_position, -1., lambda v, w: v / w,
field_width)
# TODO Evaluate if weighting bombs also here by their countdown
# TODO Exclude bombs which are not relevant (!!!!)
    bombs_field_of_view = _object_in_field_of_view(agent_position, bombs_position, 0.0,
lambda v, w: -1 * np.divide(1, v, out=np.zeros_like(v),
where=v != 0),
None)
explosion_field_of_view = _object_in_field_of_view(agent_position, explosions_position, 1.0,
lambda v, w: np.where(v == 1.0, 0.0, 1.0), None)
coins_field_of_view = _object_in_field_of_view(agent_position, coins_position, 0.0,
lambda v, w: np.divide(1, v, out=np.zeros_like(v), where=v != 0),
None)
crates_field_of_view = _object_in_field_of_view(agent_position, crates_position, -1., lambda v, w: v / w, field_width)
# walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, lambda v, w: v / w, field_width)
walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, 0.0,
crates_field_of_view = _object_in_field_of_view(agent_position, crates_position, 0.0,
lambda v, w: np.divide(1, v, out=np.zeros_like(v), where=v != 0),
None)
walls_field_of_view = _object_in_field_of_view(agent_position, walls_position, 1.0,
lambda v, w: np.where(v == 1.0, 0.0, 1.0), None)

# TODO Set auxiliary reward for moving away from a danger zone
# TODO Negative reward for staying multiple steps in same position
# TODO Negative reward repetition of moves

# return np.concatenate((bomb_danger_zones, coins_zones, crates_zones, opponents_zones,
# explosion_field_of_view, coins_field_of_view, crates_field_of_view,
# walls_field_of_view), axis=None)
features = softmax(np.sum(np.concatenate((coins_zones, coins_field_of_view), axis=None).reshape(2, 4), axis=0))
# return np.concatenate((coins_zones, coins_field_of_view, walls_field_of_view), axis=None)
# return np.concatenate((coins_zones, coins_field_of_view), axis=None)
# return np.concatenate((bomb_danger_zones, coins_zones, crates_zones, opponents_zones), axis=None)
# return np.concatenate((coins_field_of_view, walls_field_of_view), axis=None)

features[walls_field_of_view == 0.] = -1.0
f_bombs = np.sum(np.vstack((bombs_zones, bombs_field_of_view)), axis=0)
if not np.all((f_bombs == 0.0)):
f_bombs = np.where(f_bombs == 0.0, np.inf, f_bombs)
f_bombs = -1 * softmax(-1 * f_bombs)

f_coins = np.sum(np.vstack((coins_zones, coins_field_of_view)), axis=0)
if not np.all((f_coins == 0.0)):
f_coins = np.where(f_coins == 0.0, -np.inf, f_coins)
f_coins = softmax(f_coins)
f_coins[walls_field_of_view == 0.] = -1.0
f_coins[explosion_field_of_view == 0.] = -1.0

f_crates = np.sum(np.vstack((crates_zones, crates_field_of_view)), axis=0)
if not np.all((f_crates == 0.0)):
f_crates = np.where(f_crates == 0.0, -np.inf, f_crates)
f_crates = softmax(f_crates)
f_crates[walls_field_of_view == 0.] = -1.0
f_crates[explosion_field_of_view == 0.] = -1.0

features = np.concatenate((f_coins, f_crates, f_bombs, agent_bomb_action), axis=None)

return features

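A note on the -np.inf trick used above: scipy's softmax maps -inf entries to exactly 0, so zones or view directions that contain no object keep a weight of 0.0 instead of receiving a share of the probability mass. A minimal, self-contained illustration (the distance values are made up):

import numpy as np
from scipy.special import softmax

dist = np.array([4.0, 0.0, 2.0, 0.0])  # 0.0 means no object in that zone
inv = np.divide(1, dist, out=np.full_like(dist, -np.inf), where=dist != 0)
print(softmax(inv))  # entries that were -np.inf come out as exactly 0.0, roughly [0.44 0. 0.56 0.]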
15 changes: 9 additions & 6 deletions agent_code/auto_bomber/model.py
@@ -21,6 +21,8 @@ def __init__(self, train, feature_extractor):
elif model_path.MODEL_DIR and not Path(model_path.MODEL_DIR).is_dir():
raise FileNotFoundError("The specified model directory does not exist!\nIf you wish to train a NEW model"
"set parameter to None, otherwise specify a valid model directory.")
elif not self.train and not model_path.MODEL_DIR:
raise ValueError("No model directory has been specified.\n A model directory is required for inference.")
else:
root_dir = Path(model_path.MODELS_ROOT)
root_dir.mkdir(parents=True, exist_ok=True)
@@ -50,19 +52,20 @@ def store(self):
with self.weights_path.open(mode="wb") as file:
pickle.dump(self.weights, file)

def select_best_action(self, game_state: dict, agent_self):
def select_best_action(self, game_state: dict, agent_self, softmax=False):
features_x = self.feature_extractor(game_state)
self.init_if_needed(features_x, agent_self)

q_action_values = np.dot(self.weights, features_x)

if self.hyper_parameters["top_3_rand"]:
        if softmax:
            # Boltzmann/softmax exploration over the Q-values, scaled by the temperature
            p = np.exp(q_action_values / self.hyper_parameters["temp"]) / np.sum(
                np.exp(q_action_values / self.hyper_parameters["temp"]))
            choice = np.random.choice(np.arange(q_action_values.shape[0]), p=p)
else:
top_3_actions = q_action_values.argsort()[-3:][::-1]
            # let's keep a little bit of randomness here
choice = np.random.choice(top_3_actions, p=[0.9, 0.05, 0.05])
return self.hyper_parameters["actions"][choice]
else:
return np.argmax(q_action_values)
return self.hyper_parameters["actions"][choice]

def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
loss = []
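fit_model_with_transition_batch is cut off in this diff, so for orientation only: with a linear model, q(s, a) = weights[a] · features(s), and LEARNING_RATE/DISCOUNT from auto_bomber_config.py would typically enter a gradient step on the TD error along these lines (helper name and shapes are illustrative assumptions, not the repository's actual implementation):

import numpy as np

def q_update(weights, action, x, reward, x_next, lr=0.1, discount=0.5):
    # weights: (n_actions, n_features); x, x_next: (n_features,)
    td_target = reward + discount * np.max(weights @ x_next)
    td_error = td_target - weights[action] @ x
    weights[action] += lr * td_error * x  # gradient step on the squared TD error
    return weights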
2 changes: 1 addition & 1 deletion agent_code/auto_bomber/requirements.txt
@@ -3,4 +3,4 @@ pygame==2.0.1
tqdm==4.58.0
tensorboardX==2.1
tensorboard==2.4.1

scipy==1.6.1
38 changes: 14 additions & 24 deletions agent_code/auto_bomber/train.py
@@ -2,11 +2,13 @@
from collections import namedtuple, defaultdict
from typing import List

import events as e
from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber import custom_events as ce

# This is only an example!
from agent_code.auto_bomber.transitions import Transitions
import agent_code.auto_bomber.auto_bomber_config as config
from queue import Queue


def setup_training(self):
@@ -19,7 +21,7 @@ def setup_training(self):
"""
# Example: Setup an array that will note transition tuples
self.transitions = Transitions(state_to_features)

self.q = Queue(maxsize=config.REGION_TIME_TOLERANCE)

def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
"""
@@ -43,6 +45,14 @@ def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):

# state_to_features is defined in callbacks.py
self.transitions.add_transition(old_game_state, last_action, new_game_state, reward_from_events(self, events))
    # Punishment if the agent is still in the same region after a certain number of time steps
new_position = new_game_state["self"][3]
if self.q.full():
old_position = self.q.get()
if (old_position[0] - config.REGION_SIZE <= new_position[0] <= old_position[0] + config.REGION_SIZE) \
or (old_position[1] - config.REGION_SIZE <= new_position[1] <= old_position[1] + config.REGION_SIZE):
events.append(ce.SAME_REGION)
self.q.put(new_position)


def end_of_round(self, last_game_state: dict, last_action: str, events: List[str]):
@@ -76,31 +86,11 @@ def reward_from_events(self, events: List[str]) -> int:
Here you can modify the rewards your agent get so as to en/discourage
certain behavior.
"""
# todo reward definition (right now only first sketch):
# q: how to determine the winner?
game_rewards = {
e.COIN_COLLECTED: 100,
e.KILLED_OPPONENT: 50,
e.INVALID_ACTION: -100,
e.KILLED_SELF: -300,
e.GOT_KILLED: -50,
e.WAITED: -10,
e.SURVIVED_ROUND: 5
}

# game_rewards = {
# e.COIN_COLLECTED: 20,
# e.KILLED_OPPONENT: 40,
# e.INVALID_ACTION: -10,
# e.KILLED_SELF: -50,
# e.GOT_KILLED: -30,
# e.WAITED: -5,
# e.SURVIVED_ROUND: -1
# }

reward_sum = 0
for event in events:
if event in game_rewards:
reward_sum += game_rewards[event]
if event in config.game_rewards:
reward_sum += config.game_rewards[event]
self.logger.info(f"Awarded {reward_sum} for events {', '.join(events)}")
return reward_sum
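With the reward table now centralised in auto_bomber_config.py, the per-step reward is simply the sum over the matching entries; for example, collecting a coin while still lingering in the same region scores 50 + (-20) = 30 under the table above. A standalone sketch of that lookup (event names written as plain strings here instead of the events/custom_events constants):

game_rewards = {'COIN_COLLECTED': 50, 'SAME_REGION': -20, 'INVALID_ACTION': -1}

def reward_from_events(events):
    return sum(game_rewards.get(event, 0) for event in events)

print(reward_from_events(['COIN_COLLECTED', 'SAME_REGION']))  # -> 30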
46 changes: 0 additions & 46 deletions agent_code/auto_bomber/utils.py

This file was deleted.

2 changes: 1 addition & 1 deletion settings.py
@@ -5,7 +5,7 @@
# Game properties
COLS = 17
ROWS = 17
CRATE_DENSITY = 0.0 # 0.75
CRATE_DENSITY = 0.0 # 0.25 # 0.75
MAX_AGENTS = 4

# Round properties
