Commit 918e660 (1 parent: e130909)
Showing 4 changed files with 236 additions and 2 deletions.
@@ -0,0 +1,115 @@
import copy
from typing import Optional, Tuple

import keras
import numpy as np
import tensorflow as tf

from rl_util import Experience, ReplayBuffer
from rl_util.layers import NoisyDense


class Mario:
    """Agent that learns to play Super Mario Bros using Noisy Double Deep Q-Networks (Noisy DDQN)."""
    _GAMMA = 0.9  # discount factor for future rewards
    _LEARNING_RATE = 0.001  # learning rate for q-network
    _BATCH_SIZE = 32  # no. of experiences to sample in each training update
    _SYNC_EVERY = 10000  # no. of calls to learn() before syncing target network with online network
    _FREQ_LEARN = 1  # no. of calls to learn() between updates of the online network
    _LEARN_START = 1000  # no. of experiences in replay buffer before learning starts
    _REPLAY_BUFFER_SIZE = 100000  # no. of experiences to store in replay buffer

    def __init__(self, state_shape: tuple, action_dim: int):
        """
        :param state_shape: Shapes of the two state components (stacked image frames and last action).
        :param action_dim: Size of the discrete action space.
        """
        self._action_dim = action_dim

        # online network
        # inputs: image and last action
        input_img = keras.layers.Input(shape=state_shape[0], dtype='float32')
        input_last_action = keras.layers.Input(shape=state_shape[1], dtype='float32')
        # convolutional branch for the image (stacked frames are moved to the channel axis)
        output_img = keras.layers.Permute((2, 3, 1))(input_img)
        output_img = keras.layers.Conv2D(filters=32, kernel_size=8, strides=4, activation='relu')(output_img)
        output_img = keras.layers.Conv2D(filters=64, kernel_size=4, strides=2, activation='relu')(output_img)
        output_img = keras.layers.Conv2D(filters=64, kernel_size=3, strides=1, activation='relu')(output_img)
        output_img = keras.layers.Flatten()(output_img)
        # branch for the last action
        output_last_action = keras.layers.Flatten()(input_last_action)
        output_last_action = NoisyDense(units=32, activation='relu')(output_last_action)
        # concatenate the two branches and map to Q-values
        outputs = keras.layers.Concatenate()([output_img, output_last_action])
        outputs = NoisyDense(units=512, activation='relu')(outputs)
        q_values = NoisyDense(units=self._action_dim)(outputs)
        self._q_online = keras.Model(inputs=[input_img, input_last_action], outputs=q_values)

        # target network
        self._q_target = copy.deepcopy(self._q_online)
        self._q_target.trainable = False

        self._optimizer = keras.optimizers.Adam(learning_rate=self._LEARNING_RATE, epsilon=0.01 / self._BATCH_SIZE)

        self.memory = ReplayBuffer(size=self._REPLAY_BUFFER_SIZE)
        self._cnt_called_learn = 0

    @property
    def exploration_rate(self):
        """Returns the exploration rate. Noisy DDQN does not use epsilon-greedy, so this is always 0.0."""
        return 0.0

    @property
    def cnt_called_learn(self):
        """Returns the number of times the learn() method was called."""
        return self._cnt_called_learn

    def act(self, state, train=False) -> Tuple[int, Optional[np.ndarray]]:
        """Acting policy of the Mario agent given an observation."""
        action_values = self._q_online((np.array([state[0]]), np.array([state[1]])))
        action_idx = np.argmax(action_values, axis=1)
        return int(action_idx), action_values[0]

    def cache(self, exp: Experience):
        """Cache the experience into the memory buffer."""
        self.memory.append(exp)

    def learn(self) -> Optional[tf.Tensor]:
        """Sample experiences from memory and run one iteration of gradient descent.
        If memory is not yet full enough to sample a batch, no learning is done and None is returned.
        :return: The loss on this gradient step if learning was done, else None.
        """
        self._cnt_called_learn += 1

        if self._cnt_called_learn % self._SYNC_EVERY == 0:
            self._q_target.set_weights(self._q_online.get_weights())

        if (self._cnt_called_learn % self._FREQ_LEARN != 0 or
                self._cnt_called_learn < self._LEARN_START or
                len(self.memory) < self._BATCH_SIZE):
            return None

        experiences = self.memory.sample(self._BATCH_SIZE)
        states_img = np.array([exp.state[0] for exp in experiences])  # [batch_size, steps, width, height]
        states_last_action = np.array([exp.state[1] for exp in experiences])  # [batch_size, steps, action_dim]
        states = [states_img, states_last_action]
        next_states_img = np.array([exp.next_state[0] for exp in experiences])  # [batch_size, steps, width, height]
        next_states_last_action = np.array([exp.next_state[1] for exp in experiences])  # [batch_size, steps, action_dim]
        next_states = [next_states_img, next_states_last_action]
        actions = np.array([exp.action for exp in experiences])  # [batch_size,]
        # Double DQN: the online network selects the best next action, the target network evaluates it.
        next_q_online_values = self._q_online(next_states)  # [batch_size, action_dim]
        best_next_actions = np.argmax(next_q_online_values, axis=1)  # [batch_size,]
        next_q_target_values = self._q_target(next_states)  # [batch_size, action_dim]
        td_targets = np.array([exp.reward + (1 - exp.done) * self._GAMMA * next_q_target[best_next_action]
                               for exp, next_q_target, best_next_action
                               in zip(experiences, next_q_target_values, best_next_actions)])  # [batch_size,]
        with tf.GradientTape() as tape:
            q_values = self._q_online(states)  # [batch_size, action_dim]
            one_hot_actions = tf.one_hot(actions, self._action_dim)  # [batch_size, action_dim]
            q_values = tf.reduce_sum(q_values * one_hot_actions, axis=1)  # [batch_size,]
            td_errors = td_targets - q_values  # [batch_size,]
            loss = tf.reduce_mean(tf.square(td_errors))
        grads = tape.gradient(loss, self._q_online.trainable_variables)
        self._optimizer.apply_gradients(zip(grads, self._q_online.trainable_variables))
        return loss
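For context, here is a minimal smoke-test sketch of how the agent above might be exercised end to end. The state shapes, the Experience constructor arguments, and the use of random tensors in place of real Super Mario Bros observations are assumptions for illustration; only Mario, Experience, and ReplayBuffer come from this commit's code.

# Hypothetical smoke test for the Mario agent (shapes and Experience fields are assumed, not from this commit).
import numpy as np

ACTION_DIM = 5
STATE_SHAPE = ((4, 84, 84), (1, ACTION_DIM))  # assumed: 4 stacked 84x84 frames plus a one-hot last action

mario = Mario(state_shape=STATE_SHAPE, action_dim=ACTION_DIM)

def random_state():
    """Stand-in for a preprocessed observation; real training would use environment frames instead."""
    return (np.random.rand(*STATE_SHAPE[0]).astype(np.float32),
            np.random.rand(*STATE_SHAPE[1]).astype(np.float32))

state = random_state()
for step in range(2000):
    action, q_values = mario.act(state)
    next_state = random_state()
    # Experience is assumed to accept these fields as keyword arguments; learn() reads exactly these attributes.
    mario.cache(Experience(state=state, action=action, reward=0.0, next_state=next_state, done=False))
    loss = mario.learn()  # None until _LEARN_START calls have accumulated and a batch can be sampled
    state = next_state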
@@ -0,0 +1 @@
from .noisy_dense import NoisyDense
@@ -0,0 +1,116 @@
from typing import Optional

import numpy as np
import tensorflow as tf


class NoisyDense(tf.keras.layers.Layer):
    """Noisy dense layer with factorised Gaussian noise.
    See https://arxiv.org/abs/1706.10295 for details.
    """
    def __init__(self, units: int, activation: Optional[str] = None, sigma_0=0.5, **kwargs):
        """
        :param units: Number of neurons.
        :param activation: Activation function.
        :param sigma_0: Initial scale of the noise standard deviations.
        :param kwargs: Additional arguments to pass to the keras.layers.Layer superclass.
        """
        super().__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        self._sigma_0 = sigma_0
        self._mu_w = None
        self._sigma_w = None
        self._epsilon_w = None
        self._mu_b = None
        self._sigma_b = None
        self._epsilon_b = None

    def build(self, input_shape):
        """Builds the layer."""
        n_input = input_shape[-1]
        init_mu_min = -1 / np.sqrt(n_input)
        init_mu_max = 1 / np.sqrt(n_input)
        init_sigma = self._sigma_0 / np.sqrt(n_input)
        self._mu_w = self.add_weight(shape=(n_input, self.units),
                                     initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
                                     trainable=self.trainable, name='mu_w')
        self._sigma_w = self.add_weight(shape=(n_input, self.units),
                                        initializer=tf.keras.initializers.Constant(init_sigma),
                                        trainable=self.trainable, name='sigma_w')
        self._mu_b = self.add_weight(shape=(self.units,),
                                     initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
                                     trainable=self.trainable, name='mu_b')
        self._sigma_b = self.add_weight(shape=(self.units,),
                                        initializer=tf.keras.initializers.Constant(init_sigma),
                                        trainable=self.trainable, name='sigma_b')

        # Factorised noise: one noise vector over the inputs and one over the outputs,
        # combined via an outer product. The noise is sampled once when the layer is built.
        dtype = self._mu_w.dtype
        epsilon_in = self._f(tf.random.normal(shape=(self._mu_w.shape[0], 1), dtype=dtype))
        epsilon_out = self._f(tf.random.normal(shape=(1, self._mu_w.shape[1]), dtype=dtype))
        self._epsilon_w = tf.matmul(epsilon_in, epsilon_out)
        self._epsilon_b = epsilon_out

        super().build(input_shape)

    def call(self, inputs, **kwargs) -> tf.Tensor:
        """Calls the layer.
        :param inputs: Input tensor.
        :param kwargs: Additional arguments.
        :return: Output tensor.
        """
        # Perturb the mean parameters with the fixed noise: w = mu_w + sigma_w * epsilon_w, likewise for b.
        w = self._mu_w + self._sigma_w * self._epsilon_w
        b = self._mu_b + self._sigma_b * self._epsilon_b
        output = tf.matmul(inputs, w) + b
        if self.activation is not None:
            output = self.activation(output)
        return output

    @staticmethod
    def _f(x: tf.Tensor) -> tf.Tensor:
        """Applies the noise-scaling function f(x) = sign(x) * sqrt(|x|) from the paper.
        :param x: Input tensor.
        :return: Output tensor.
        """
        return tf.multiply(tf.sign(x), tf.sqrt(tf.abs(x)))

    def get_config(self):
        """Returns the configuration of the layer."""
        config = super().get_config()
        config.update({
            'units': self.units,
            'activation': tf.keras.activations.serialize(self.activation),
            'sigma_0': self._sigma_0
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Creates a layer from its configuration."""
        return cls(**config)

    @property
    def sigma_0(self):
        """Returns the sigma_0 parameter."""
        return self._sigma_0

    @property
    def mu_w(self):
        """Returns the mu_w parameter."""
        return self._mu_w

    @property
    def sigma_w(self):
        """Returns the sigma_w parameter."""
        return self._sigma_w

    @property
    def mu_b(self):
        """Returns the mu_b parameter."""
        return self._mu_b

    @property
    def sigma_b(self):
        """Returns the sigma_b parameter."""
        return self._sigma_b
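A minimal usage sketch of the layer above as a drop-in replacement for a regular Dense layer; everything here is standard TensorFlow/Keras plus the NoisyDense class itself, and the model, shapes, and variable names are illustrative only.

# Hypothetical example: NoisyDense used in place of keras.layers.Dense.
import numpy as np
import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(8,))
hidden = NoisyDense(units=16, activation='relu')(inputs)
outputs = NoisyDense(units=4)(hidden)  # e.g. Q-values for 4 actions
model = tf.keras.Model(inputs=inputs, outputs=outputs)

x = np.random.rand(2, 8).astype(np.float32)
print(model(x).shape)  # -> (2, 4)

# get_config()/from_config() let the layer round-trip through serialization,
# as long as NoisyDense is supplied via custom_objects when a saved model is loaded.
clone = NoisyDense.from_config(model.layers[1].get_config())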