
Commit

Merge branch 'master' into feature_engineering
PrimeF committed Mar 24, 2021
2 parents 7fa011f + 2b1efba commit b7c6657
Showing 9 changed files with 96 additions and 135 deletions.
28 changes: 0 additions & 28 deletions agent_code/auto_bomber/auto_bomber_config.py

This file was deleted.

15 changes: 8 additions & 7 deletions agent_code/auto_bomber/callbacks.py
@@ -1,10 +1,10 @@
import random
import agent_code.auto_bomber.auto_bomber_config as config

import numpy as np

from agent_code.auto_bomber.model import LinearAutoBomberModel
from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber.model import LinearAutoBomberModel


def setup(self):
"""
@@ -33,14 +33,15 @@ def act(self, game_state: dict) -> str:
:return: The action to take as a string.
"""

if self.train and config.POLICY == 'SOFTMAX':
hyper_parameters = self.model.hyper_parameters
if self.train and hyper_parameters["policy"] == 'SOFTMAX':
self.model.select_best_action(game_state, self, softmax=True)
elif self.train and random.random() < config.EPSILON:
if config.POLICY == 'GREEDY':
elif self.train and random.random() < hyper_parameters["epsilon"]:
if hyper_parameters["policy"] == 'GREEDY':
self.logger.debug("Choosing action purely at random.")
# 80%: walk in any direction. 10% wait. 10% bomb.
return np.random.choice(config.ACTIONS, p=[.2, .2, .2, .2, .1, .1])
elif config.POLICY == 'IANN':
return np.random.choice(hyper_parameters["actions"], p=[.2, .2, .2, .2, .1, .1])
elif hyper_parameters["policy"] == 'IANN':
self.model.select_best_action(game_state, self, softmax=True)
else:
self.logger.debug("Querying model for action.")
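The branches above now read the policy, epsilon, and action list from self.model.hyper_parameters instead of the deleted auto_bomber_config module. A minimal, self-contained sketch of the exploration branch under the new defaults (choose_exploration_action and the inline dict are illustrative only, not part of the commit):

import random
import numpy as np

# Hyper-parameters as loaded from hyper_parameters.json (values mirror the
# defaults added in this commit).
hyper_parameters = {
    "actions": ["UP", "RIGHT", "DOWN", "LEFT", "WAIT", "BOMB"],
    "epsilon": 0.35,
    "policy": "GREEDY",
}

def choose_exploration_action(hp):
    """Epsilon-greedy exploration branch from act(), driven by the dict
    instead of the deleted auto_bomber_config constants."""
    if random.random() < hp["epsilon"] and hp["policy"] == "GREEDY":
        # 80%: walk in any direction. 10% wait. 10% bomb.
        return np.random.choice(hp["actions"], p=[.2, .2, .2, .2, .1, .1])
    return None  # fall through to querying the model for the best action

print(choose_exploration_action(hyper_parameters))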
28 changes: 28 additions & 0 deletions agent_code/auto_bomber/default_hyper_parameters.json
@@ -0,0 +1,28 @@
{
"actions": [
"UP",
"RIGHT",
"DOWN",
"LEFT",
"WAIT",
"BOMB"
],
"epsilon": 0.35,
"discount": 0.95,
"learning_rate": 0.01,
"policy": "SOFTMAX",
"temperature": 0.5,
"region_size": 2,
"region_time_tolerance": 8,
"game_rewards": {
"CRATE_DESTROYED": 20,
"COIN_FOUND": 20,
"COIN_COLLECTED": 50,
"KILLED_OPPONENT": 200,
"INVALID_ACTION": -5,
"KILLED_SELF": -300,
"GOT_KILLED": -200,
"SURVIVED_ROUND": 300,
"SAME_REGION": -20
}
}
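This file replaces the deleted auto_bomber_config.py and is copied into each new model directory as hyper_parameters.json, where model.py reads it back. A short sketch of how such a copy could be loaded (the models/0 path is only an example):

import json
from pathlib import Path

# Minimal sketch mirroring the json.load call added to model.py below;
# the path here is hypothetical.
hyper_parameters_path = Path("models/0/hyper_parameters.json")
if hyper_parameters_path.is_file():
    with hyper_parameters_path.open(mode="rb") as file:
        hyper_parameters = json.load(file)
    print(hyper_parameters["game_rewards"]["COIN_COLLECTED"])  # 50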
54 changes: 37 additions & 17 deletions agent_code/auto_bomber/model.py
@@ -1,12 +1,21 @@
import shutil
import json
import os
import pickle
import shutil
from pathlib import Path

import numpy as np
from tensorboardX import SummaryWriter

import agent_code.auto_bomber.model_path as model_path
from agent_code.auto_bomber.transitions import Transitions
import agent_code.auto_bomber.auto_bomber_config as config


def get_model_dir():
try:
return os.environ["MODEL_DIR"]
except KeyError as e:
return model_path.MODEL_DIR


class LinearAutoBomberModel:
@@ -15,28 +24,37 @@ def __init__(self, train, feature_extractor):
self.weights = None
self.feature_extractor = feature_extractor

if config.MODEL_DIR and Path(config.MODEL_DIR).is_dir():
self.model_dir = Path(config.MODEL_DIR)
elif config.MODEL_DIR and not Path(config.MODEL_DIR).is_dir():
model_dir = get_model_dir()
if model_dir and Path(model_dir).is_dir():
self.model_dir = Path(model_dir)
elif model_dir and not Path(model_dir).is_dir():
raise FileNotFoundError("The specified model directory does not exist!\nIf you wish to train a NEW model"
"set parameter to None, otherwise specify a valid model directory.")
elif not self.train and not config.MODEL_DIR:
elif not self.train and not model_dir:
raise ValueError("No model directory has been specified.\n A model directory is required for inference.")
else:
model_index = sorted([int(x.stem) for x in Path(config.MODELS_ROOT).iterdir() if x.is_dir()])[-1]
root_dir = Path(model_path.MODELS_ROOT)
root_dir.mkdir(parents=True, exist_ok=True)
existing_subdirs = sorted([int(x.stem) for x in root_dir.iterdir() if x.is_dir()])

model_index = existing_subdirs[-1] if existing_subdirs else -1
model_index += 1
self.model_dir = Path(config.MODELS_ROOT) / str(model_index)
self.model_dir = Path(model_path.MODELS_ROOT) / str(model_index)
self.model_dir.mkdir()
# Copy configuration file for logging purposes
shutil.copy(Path("default_hyper_parameters.json"), self.model_dir / "hyper_parameters.json")

self.weights_path = self.model_dir / "weights.pt"
if self.weights_path.is_file():
with self.weights_path.open(mode="rb") as file:
self.weights = pickle.load(file)

if self.train:
# Copy configuration file for logging purposes
shutil.copy(Path("./auto_bomber_config.py"), self.model_dir / "config.py")
hyper_parameters_path = self.model_dir / "hyper_parameters.json"
if hyper_parameters_path.is_file():
with hyper_parameters_path.open(mode="rb") as file:
self.hyper_parameters = json.load(file)

if self.train:
self.writer = SummaryWriter(logdir=f"./runs/exp{self.model_dir.stem}")

def store(self):
Expand All @@ -51,17 +69,18 @@ def select_best_action(self, game_state: dict, agent_self, softmax=False):

if softmax:
sort_actions = q_action_values.argsort()
p = np.exp(sort_actions / config.TEMP) / np.sum(np.exp(sort_actions / config.TEMP))
temp = self.hyper_parameters["temperature"]
p = np.exp(sort_actions / temp) / np.sum(np.exp(sort_actions / temp))
choice = np.random.choice(sort_actions, p=p)
else:
top_3_actions = q_action_values.argsort()[-3:][::-1]
choice = np.random.choice(top_3_actions, p=[0.9, 0.05, 0.05])
return config.ACTIONS[choice]
return self.hyper_parameters["actions"][choice]

def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
loss = []
numpy_transitions = transitions.to_numpy_transitions()
for action_id, action in enumerate(config.ACTIONS):
numpy_transitions = transitions.to_numpy_transitions(self.hyper_parameters)
for action_id, action in enumerate(self.hyper_parameters["actions"]):
x_all_t, y_all_t = numpy_transitions.get_features_and_value_estimates(action)

if x_all_t.size != 0:
@@ -70,7 +89,7 @@ def fit_model_with_transition_batch(self, transitions: Transitions, round: int):
loss.append(np.mean(residuals ** 2))
q_grad = np.dot(x_all_t.T, residuals)

weight_updates = config.LEARNING_RATE / y_all_t.shape[0] * q_grad
weight_updates = self.hyper_parameters["learning_rate"] / y_all_t.shape[0] * q_grad
self.weights[action_id] += weight_updates

mean_loss = np.mean(loss)
@@ -83,4 +102,5 @@ def init_if_needed(self, features_x, agent_self):
agent_self.logger.info("Model is empty init with random weights.")

# Xavier weights initialization
self.weights = np.random.rand(len(config.ACTIONS), len(features_x)) * np.sqrt(1 / len(features_x))
self.weights = np.random.rand(len(self.hyper_parameters["actions"]),
len(features_x)) * np.sqrt(1 / len(features_x))
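fit_model_with_transition_batch() now takes its learning rate from the loaded hyper-parameters; the update itself is an ordinary least-squares gradient step on a per-action linear Q-function. A self-contained sketch with stand-in data (the shapes and random values are illustrative, not taken from the commit):

import numpy as np

# One gradient step for a single action, as in fit_model_with_transition_batch():
# x_all_t are feature vectors, y_all_t the Monte Carlo value estimates.
learning_rate = 0.01                               # hyper_parameters["learning_rate"]
rng = np.random.default_rng(0)
x_all_t = rng.random((8, 5)).astype(np.float32)    # 8 transitions, 5 features
y_all_t = rng.random(8).astype(np.float32)         # value estimates for this action
weights = rng.random(5).astype(np.float32)         # weight row for this action

residuals = y_all_t - np.dot(x_all_t, weights)
loss = np.mean(residuals ** 2)                     # logged per round via tensorboardX
q_grad = np.dot(x_all_t.T, residuals)
weights += learning_rate / y_all_t.shape[0] * q_grad
print(loss)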
2 changes: 2 additions & 0 deletions agent_code/auto_bomber/model_path.py
@@ -0,0 +1,2 @@
MODELS_ROOT = "./models"
MODEL_DIR = None
65 changes: 0 additions & 65 deletions agent_code/auto_bomber/test_transitions.py

This file was deleted.

24 changes: 12 additions & 12 deletions agent_code/auto_bomber/train.py
@@ -1,14 +1,10 @@
import numpy as np
from collections import namedtuple, defaultdict
from queue import Queue
from typing import List

from agent_code.auto_bomber.feature_engineering import state_to_features
from agent_code.auto_bomber import custom_events as ce

from agent_code.auto_bomber.feature_engineering import state_to_features
# This is only an example!
from agent_code.auto_bomber.transitions import Transitions
import agent_code.auto_bomber.auto_bomber_config as config
from queue import Queue


def setup_training(self):
@@ -21,7 +17,9 @@ def setup_training(self):
"""
# Example: Setup an array that will note transition tuples
self.transitions = Transitions(state_to_features)
self.q = Queue(maxsize=config.REGION_TIME_TOLERANCE)

self.q = Queue(maxsize=self.model.hyper_parameters["region_time_tolerance"])


def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
"""
@@ -42,15 +40,15 @@ def game_events_occurred(self, old_game_state: dict, last_action: str, new_game_state: dict, events: List[str]):
:param events: The events that occurred when going from `old_game_state` to `new_game_state`
"""
self.logger.debug(f'Encountered game event(s) {", ".join(map(repr, events))} in step {new_game_state["step"]}')

# state_to_features is defined in callbacks.py
self.transitions.add_transition(old_game_state, last_action, new_game_state, reward_from_events(self, events))
# Punishment, if agent is still in the same radius after certain time steps
new_position = new_game_state["self"][3]
region_size = self.model.hyper_parameters["region_size"]
if self.q.full():
old_position = self.q.get()
if (old_position[0] - config.REGION_SIZE <= new_position[0] <= old_position[0] + config.REGION_SIZE) \
or (old_position[1] - config.REGION_SIZE <= new_position[1] <= old_position[1] + config.REGION_SIZE):
if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
events.append(ce.SAME_REGION)
self.q.put(new_position)

@@ -87,9 +85,11 @@ def reward_from_events(self, events: List[str]) -> int:
certain behavior.
"""
# q: how to determine the winner?

rewards_dict = self.model.hyper_parameters["game_rewards"]
reward_sum = 0
for event in events:
if event in config.game_rewards:
reward_sum += config.game_rewards[event]
if event in rewards_dict:
reward_sum += rewards_dict[event]
self.logger.info(f"Awarded {reward_sum} for events {', '.join(events)}")
return reward_sum
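setup_training() and game_events_occurred() now take region_time_tolerance and region_size from the model's hyper-parameters rather than the config module. A self-contained sketch of the queue-based SAME_REGION check (check_same_region and the stationary-agent loop are illustrative only):

from queue import Queue

# Positions are buffered in a fixed-size FIFO queue; once it is full, the
# position from region_time_tolerance steps ago is compared to the current one.
region_size = 2
region_time_tolerance = 8
q = Queue(maxsize=region_time_tolerance)

def check_same_region(new_position):
    penalised = False
    if q.full():
        old_position = q.get()
        if (old_position[0] - region_size <= new_position[0] <= old_position[0] + region_size) \
                or (old_position[1] - region_size <= new_position[1] <= old_position[1] + region_size):
            penalised = True  # train.py appends ce.SAME_REGION here
    q.put(new_position)
    return penalised

for step in range(12):
    # A stationary agent starts getting penalised once the tolerance is exceeded.
    print(step, check_same_region((1, 1)))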
12 changes: 6 additions & 6 deletions agent_code/auto_bomber/transitions.py
@@ -1,5 +1,4 @@
import numpy as np
import agent_code.auto_bomber.auto_bomber_config as config


class Transitions:
@@ -17,8 +16,8 @@ def add_transition(self, old_game_state, action, new_game_state, rewards):
self.next_states.append(self.feature_extractor(new_game_state))
self.rewards.append(rewards)

def to_numpy_transitions(self):
return NumpyTransitions(self)
def to_numpy_transitions(self, hyper_parameters):
return NumpyTransitions(self, hyper_parameters)

def clear(self):
self.states.clear()
@@ -29,11 +28,12 @@ def clear(self):

class NumpyTransitions:
# todo add hyperparam for batch size to support TD-n-step and monte-carlo
def __init__(self, transitions):
def __init__(self, transitions, hyper_parameters):
self.states = np.asarray(transitions.states, dtype=np.float32)
self.actions = np.asarray(transitions.actions)
self.next_states = np.asarray(transitions.next_states, dtype=np.float32)
self.rewards = np.asarray(transitions.rewards, dtype=np.float32)
self.hyper_parameters = hyper_parameters

def get_time_steps_for_action(self, action):
return np.argwhere(self.actions == action)
@@ -47,6 +47,6 @@ def get_features_and_value_estimates(self, action):

def monte_carlo_value_estimation(self, time_step_start: int):
relevant_rewards = self.rewards[time_step_start:]
discounts = np.fromfunction(lambda i: config.DISCOUNT ** i, shape=(len(relevant_rewards),), dtype=np.float32)
discounts = np.fromfunction(lambda i: self.hyper_parameters["discount"] ** i,
shape=(len(relevant_rewards),), dtype=np.float32)
return np.sum(discounts * relevant_rewards)
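monte_carlo_value_estimation() now reads the discount factor from the hyper-parameters passed into NumpyTransitions. A stand-alone sketch of the discounted-return computation with example rewards (the reward values are illustrative):

import numpy as np

discount = 0.95  # hyper_parameters["discount"]
rewards = np.asarray([20.0, 0.0, 50.0, -5.0, 300.0], dtype=np.float32)
time_step_start = 1

# Return from time_step_start: discounted sum of all later rewards.
relevant_rewards = rewards[time_step_start:]
discounts = np.fromfunction(lambda i: discount ** i,
                            shape=(len(relevant_rewards),), dtype=np.float32)
print(np.sum(discounts * relevant_rewards))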

3 changes: 3 additions & 0 deletions train_scripts/coins_only.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

python3 main.py play --agents auto_bomber --train 1 --n-rounds 100000 --no-gui
