Add noisy DQN
KyosukeIchikawa committed Sep 30, 2023
1 parent e130909 commit 918e660
Showing 4 changed files with 236 additions and 2 deletions.
6 changes: 4 additions & 2 deletions mario_rl/main.py
@@ -110,10 +110,12 @@ def __init__(self):
self._video_max_value = Mario._V_MAX
elif args.rl == "ddqn":
from mario_ddqn import Mario
- elif args.rl == "prioritized_ddqn":
- from mario_prioritized_ddqn import Mario
elif args.rl == "dueling_ddqn":
from mario_dueling_ddqn import Mario
+ elif args.rl == "noisy_dqn":
+ from mario_noisy_dqn import Mario
+ elif args.rl == "prioritized_ddqn":
+ from mario_prioritized_ddqn import Mario
else:
raise ValueError(f"Unknown RL algorithm: {args.rl}")
state_img_shape = env.observation_space.shape
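For orientation, here is a minimal construction sketch (illustrative, not part of this commit): it builds the new mario_noisy_dqn.Mario agent, defined in the file below, the way main.py wires up the other agents. The concrete shapes are assumptions standing in for what main.py derives from the wrapped environment, which is outside this hunk.

# Illustrative only: construct the Noisy DQN agent as main.py would after dispatching
# on args.rl == "noisy_dqn". The shapes below are assumptions, not values from main.py.
import numpy as np
from mario_noisy_dqn import Mario

state_img_shape = (4, 84, 84)        # assumed: 4 stacked frames of 84x84 pixels, channels-first as the Permute layer expects
action_dim = 5                       # assumed size of the discrete action set
last_action_shape = (1, action_dim)  # assumed shape of the "last action" input
agent = Mario(state_shape=(state_img_shape, last_action_shape), action_dim=action_dim)

state = (np.zeros(state_img_shape, dtype=np.float32),
         np.zeros(last_action_shape, dtype=np.float32))
action, q_values = agent.act(state)  # greedy action; exploration comes from the NoisyDense layers, not epsilon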
115 changes: 115 additions & 0 deletions mario_rl/mario_noisy_dqn.py
@@ -0,0 +1,115 @@
import copy
from typing import Optional

import keras
import numpy as np
import tensorflow as tf

from rl_util import Experience, ReplayBuffer
from rl_util.layers import NoisyDense


class Mario:
"""Agent that learns to play Super Mario Bros using Noisy Double Deep Q-Networks (Noisy DDQN)."""
_GAMMA = 0.9 # discount factor for future rewards
_LEARNING_RATE = 0.001 # learning rate for q-network
_BATCH_SIZE = 32 # no. of experiences to sample in each training update
_SYNC_EVERY = 10000 # no. of calls to learn() before syncing target network with online network
_FREQ_LEARN = 1 # no. of calls to learn() before updating online network
_LEARN_START = 1000 # no. of experiences in replay buffer before learning starts
_REPLAY_BUFFER_SIZE = 100000 # no. of experiences to store in replay buffer

def __init__(self, state_shape: tuple, action_dim: int):
"""
:param state_shape: Shape of the state space (image and last action).
:param action_dim: Number of actions available to the agent.
"""
self._action_dim = action_dim

# online network
# input image and last action
input_img = keras.layers.Input(shape=state_shape[0], dtype='float32')
input_last_action = keras.layers.Input(shape=state_shape[1], dtype='float32')
# network for image
output_img = keras.layers.Permute((2, 3, 1))(input_img)
output_img = keras.layers.Conv2D(filters=32, kernel_size=8, strides=4, activation='relu')(output_img)
output_img = keras.layers.Conv2D(filters=64, kernel_size=4, strides=2, activation='relu')(output_img)
output_img = keras.layers.Conv2D(filters=64, kernel_size=3, strides=1, activation='relu')(output_img)
output_img = keras.layers.Flatten()(output_img)
# network for last action
output_last_action = keras.layers.Flatten()(input_last_action)
output_last_action = NoisyDense(units=32, activation='relu')(output_last_action)
# concatenate networks
outputs = keras.layers.Concatenate()([output_img, output_last_action])
outputs = NoisyDense(units=512, activation='relu')(outputs)
q_values = NoisyDense(units=self._action_dim)(outputs)
self._q_online = keras.Model(inputs=[input_img, input_last_action], outputs=q_values)

# target network
self._q_target = copy.deepcopy(self._q_online)
self._q_target.trainable = False

self._optimizer = keras.optimizers.Adam(learning_rate=self._LEARNING_RATE, epsilon=0.01/self._BATCH_SIZE)

self.memory = ReplayBuffer(size=self._REPLAY_BUFFER_SIZE)
self._cnt_called_learn = 0

@property
def exploration_rate(self):
"""Returns the exploration rate. Noisy DDQN does not use epsilon-greedy exploration, so this always returns 0.0."""
return 0.0

@property
def cnt_called_learn(self):
"""Returns the number of times the learn() method was called."""
return self._cnt_called_learn

def act(self, state, train=False) -> (int, Optional[np.ndarray]):
"""Acting Policy of the Mario Agent given an observation."""
action_values = self._q_online((np.array([state[0]]), np.array([state[1]])))
action_idx = np.argmax(action_values, axis=1)
return int(action_idx), action_values[0]

def cache(self, exp: Experience):
"""Cache the experience into memory buffer"""
self.memory.append(exp)

def learn(self) -> Optional[list]:
"""Sample experiences from memory and run one iteration of gradient descent.
If memory is not yet full enough to sample a batch, no learning is done and None is returned.
:return: The loss on this gradient step if learning was done, else None.
"""
self._cnt_called_learn += 1

if self._cnt_called_learn % self._SYNC_EVERY == 0:
self._q_target.set_weights(self._q_online.get_weights())

if (self._cnt_called_learn % self._FREQ_LEARN != 0 or
self._cnt_called_learn < self._LEARN_START or
len(self.memory) < self._BATCH_SIZE):
return None

experiences = self.memory.sample(self._BATCH_SIZE)
states_img = np.array([exp.state[0] for exp in experiences]) # [batch_size, steps, width, height]
states_last_action = np.array([exp.state[1] for exp in experiences]) # [batch_size, steps, action_dim]
states = [states_img, states_last_action]
next_states_img = np.array([exp.next_state[0] for exp in experiences]) # [batch_size, steps, width, height]
next_states_last_action = np.array([exp.next_state[1] for exp in experiences]) # [batch_size, steps, action_dim]
next_states = [next_states_img, next_states_last_action]
actions = np.array([exp.action for exp in experiences]) # [batch_size,]
next_q_online_values = self._q_online(next_states) # [batch_size, action_dim]
best_next_actions = np.argmax(next_q_online_values, axis=1) # [batch_size,]
next_q_target_values = self._q_target(next_states) # [batch_size, action_dim]
td_targets = np.array([exp.reward + (1 - exp.done) * self._GAMMA * next_q_target[best_next_action]
for exp, next_q_target, best_next_action
in zip(experiences, next_q_target_values, best_next_actions)]) # [batch_size,]
with tf.GradientTape() as tape:
q_values = self._q_online(states) # [batch_size, action_dim]
one_hot_actions = tf.one_hot(actions, self._action_dim) # [batch_size, action_dim]
q_values = tf.reduce_sum(q_values * one_hot_actions, axis=1) # [batch_size,]
td_errors = td_targets - q_values # [batch_size,]
loss = tf.reduce_mean(tf.square(td_errors))
grads = tape.gradient(loss, self._q_online.trainable_variables)
self._optimizer.apply_gradients(zip(grads, self._q_online.trainable_variables))
return loss
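To show how the act/cache/learn API above fits together, here is an illustrative training-loop sketch (not from the repository). It reuses the agent from the construction sketch earlier, feeds random transitions instead of a real environment, and assumes the Experience keyword names from the attributes that learn() reads (state, action, reward, next_state, done).

# Illustrative training loop: random transitions stand in for the environment.
import numpy as np
from rl_util import Experience

def random_state():
    # Assumed shapes, matching the construction sketch above.
    return (np.random.rand(4, 84, 84).astype(np.float32),
            np.zeros((1, 5), dtype=np.float32))

state = random_state()
for step in range(2000):
    action, _ = agent.act(state, train=True)       # NoisyDense weights supply the exploration noise
    next_state = random_state()
    reward = float(np.random.rand())               # placeholder reward
    done = (step % 200 == 199)                     # placeholder episode boundary
    agent.cache(Experience(state=state, action=action, reward=reward,
                           next_state=next_state, done=done))
    loss = agent.learn()                           # None for roughly the first _LEARN_START calls, then the TD loss
    state = random_state() if done else next_state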
1 change: 1 addition & 0 deletions mario_rl/rl_util/layers/__init__.py
@@ -0,0 +1 @@
from .noisy_dence import NoisyDense
116 changes: 116 additions & 0 deletions mario_rl/rl_util/layers/noisy_dence.py
@@ -0,0 +1,116 @@
from typing import Optional

import numpy as np
import tensorflow as tf


class NoisyDense(tf.keras.layers.Layer):
"""Noisy dense layer.
See https://arxiv.org/abs/1706.10295 for details.
"""
def __init__(self, units: int, activation: Optional[str] = None, sigma_0=0.5, **kwargs):
"""
:param units: Number of neurons.
:param activation: Activation function.
:param sigma_0: Initial scale of the noise standard deviations (sigma_0 in the paper; defaults to 0.5).
:param kwargs: Additional arguments to pass to the keras.layers.Layer superclass.
"""
super().__init__(**kwargs)
self.units = units
self.activation = tf.keras.activations.get(activation)
self._sigma_0 = sigma_0
self._mu_w = None
self._sigma_w = None
self._epsilon_w = None
self._mu_b = None
self._sigma_b = None
self._epsilon_b = None

def build(self, input_shape):
"""Builds the layer."""
n_input = input_shape[-1]
init_mu_min = -1/np.sqrt(n_input)
init_mu_max = 1/np.sqrt(n_input)
init_sigma = self._sigma_0 / np.sqrt(n_input)
self._mu_w = self.add_weight(shape=(n_input, self.units),
initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
trainable=self.trainable, name='mu_w')
self._sigma_w = self.add_weight(shape=(n_input, self.units),
initializer=tf.keras.initializers.Constant(init_sigma),
trainable=self.trainable, name='sigma_w')
self._mu_b = self.add_weight(shape=(self.units,),
initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
trainable=True, name='mu_b')
self._sigma_b = self.add_weight(shape=(self.units,),
initializer=tf.keras.initializers.Constant(init_sigma),
trainable=True, name='sigma_b')

dtype = self._mu_w.dtype
# Factorized Gaussian noise, epsilon_w = f(eps_in) f(eps_out)^T, sampled once when the layer is built.
epsilon_in = self._f(tf.random.normal(shape=(self._mu_w.shape[0], 1), dtype=dtype))
epsilon_out = self._f(tf.random.normal(shape=(1, self._mu_w.shape[1]), dtype=dtype))
self._epsilon_w = tf.matmul(epsilon_in, epsilon_out)
self._epsilon_b = epsilon_out

super().build(input_shape)

def call(self, inputs, **kwargs) -> tf.Tensor:
"""Calls the layer.
:param inputs: Input tensor.
:param kwargs: Additional arguments.
:return: Output tensor.
"""
w = self._mu_w + self._sigma_w * self._epsilon_w
b = self._mu_b + self._sigma_b * self._epsilon_b
output = tf.matmul(inputs, w) + b
if self.activation is not None:
output = self.activation(output)
return output

@staticmethod
def _f(x: tf.Tensor) -> tf.Tensor:
"""Applies the noise-scaling function f(x) = sign(x) * sqrt(|x|) to the given tensor.
:param x: Input tensor.
:return: Output tensor.
"""
return tf.multiply(tf.sign(x), tf.sqrt(tf.abs(x)))

def get_config(self):
"""Returns the configuration of the layer."""
config = super().get_config()
config.update({
'units': self.units,
'activation': tf.keras.activations.serialize(self.activation),
'sigma_0': self._sigma_0
})
return config

@classmethod
def from_config(cls, config):
"""Creates a layer from its configuration."""
return cls(**config)

@property
def sigma_0(self):
"""Returns the sigma_0 parameter."""
return self._sigma_0

@property
def mu_w(self):
"""Returns the mu_w parameter."""
return self._mu_w

@property
def sigma_w(self):
"""Returns the sigma_w parameter."""
return self._sigma_w

@property
def mu_b(self):
"""Returns the mu_b parameter."""
return self._mu_b

@property
def sigma_b(self):
"""Returns the sigma_b parameter."""
return self._sigma_b
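As a quick sanity check (illustrative, not part of the commit), the layer's output with no activation can be compared against the noisy affine map it implements, y = x (mu_w + sigma_w * eps_w) + (mu_b + sigma_b * eps_b). The check peeks at the private _epsilon_* buffers, which is acceptable for a throwaway test, and assumes mario_rl/ is on the import path.

# Illustrative sanity check: NoisyDense with no activation should compute
# x @ (mu_w + sigma_w * eps_w) + (mu_b + sigma_b * eps_b).
import numpy as np
import tensorflow as tf
from rl_util.layers import NoisyDense

layer = NoisyDense(units=4)
x = tf.random.normal((2, 3))                       # batch of 2, input dimension 3
y = layer(x)                                       # first call triggers build(), which samples the factorized noise
w = layer.mu_w + layer.sigma_w * layer._epsilon_w  # reads the private noise buffers for the check
b = layer.mu_b + layer.sigma_b * layer._epsilon_b
np.testing.assert_allclose(y.numpy(), (tf.matmul(x, w) + b).numpy(), rtol=1e-5)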
