Commit 918e660 (1 parent: e130909)
Showing 4 changed files with 236 additions and 2 deletions.
@@ -0,0 +1,115 @@
import copy
from typing import Optional, Tuple

import keras
import numpy as np
import tensorflow as tf

from rl_util import Experience, ReplayBuffer
from rl_util.layers import NoisyDense


class Mario:
    """Agent that learns to play Super Mario Bros using Noisy Double Deep Q-Networks (Noisy DDQN)."""
    _GAMMA = 0.9  # discount factor for future rewards
    _LEARNING_RATE = 0.001  # learning rate for q-network
    _BATCH_SIZE = 32  # no. of experiences to sample in each training update
    _SYNC_EVERY = 10000  # no. of calls to learn() before syncing target network with online network
    _FREQ_LEARN = 1  # no. of calls to learn() between updates of the online network
    _LEARN_START = 1000  # no. of experiences in replay buffer before learning starts
    _REPLAY_BUFFER_SIZE = 100000  # no. of experiences to store in replay buffer

    def __init__(self, state_shape: tuple, action_dim: int):
        """
        :param state_shape: Shapes of the two state components (stacked image frames and last action).
        :param action_dim: Size of the discrete action space.
        """
        self._action_dim = action_dim

        # online network
        # inputs: image and last action
        input_img = keras.layers.Input(shape=state_shape[0], dtype='float32')
        input_last_action = keras.layers.Input(shape=state_shape[1], dtype='float32')
        # convolutional branch for the image (stacked frames are moved to the channel axis)
        output_img = keras.layers.Permute((2, 3, 1))(input_img)
        output_img = keras.layers.Conv2D(filters=32, kernel_size=8, strides=4, activation='relu')(output_img)
        output_img = keras.layers.Conv2D(filters=64, kernel_size=4, strides=2, activation='relu')(output_img)
        output_img = keras.layers.Conv2D(filters=64, kernel_size=3, strides=1, activation='relu')(output_img)
        output_img = keras.layers.Flatten()(output_img)
        # branch for the last action
        output_last_action = keras.layers.Flatten()(input_last_action)
        output_last_action = NoisyDense(units=32, activation='relu')(output_last_action)
        # concatenate the two branches and map to Q-values
        outputs = keras.layers.Concatenate()([output_img, output_last_action])
        outputs = NoisyDense(units=512, activation='relu')(outputs)
        q_values = NoisyDense(units=self._action_dim)(outputs)
        self._q_online = keras.Model(inputs=[input_img, input_last_action], outputs=q_values)

        # target network
        self._q_target = copy.deepcopy(self._q_online)
        self._q_target.trainable = False

        self._optimizer = keras.optimizers.Adam(learning_rate=self._LEARNING_RATE, epsilon=0.01 / self._BATCH_SIZE)

        self.memory = ReplayBuffer(size=self._REPLAY_BUFFER_SIZE)
        self._cnt_called_learn = 0

    @property
    def exploration_rate(self):
        """Returns the exploration rate. Noisy DDQN does not use epsilon-greedy, so this is always 0.0."""
        return 0.0

    @property
    def cnt_called_learn(self):
        """Returns the number of times the learn() method was called."""
        return self._cnt_called_learn

    def act(self, state, train=False) -> Tuple[int, Optional[np.ndarray]]:
        """Acting policy of the Mario agent given an observation."""
        action_values = self._q_online((np.array([state[0]]), np.array([state[1]])))
        action_idx = np.argmax(action_values, axis=1)
        return int(action_idx), action_values[0]

    def cache(self, exp: Experience):
        """Cache the experience into the memory buffer."""
        self.memory.append(exp)

    def learn(self) -> Optional[tf.Tensor]:
        """Sample experiences from memory and run one iteration of gradient descent.
        If memory is not yet full enough to sample a batch, no learning is done and None is returned.
        :return: The loss on this gradient step if learning was done, else None.
        """
        self._cnt_called_learn += 1

        if self._cnt_called_learn % self._SYNC_EVERY == 0:
            self._q_target.set_weights(self._q_online.get_weights())

        if (self._cnt_called_learn % self._FREQ_LEARN != 0 or
                self._cnt_called_learn < self._LEARN_START or
                len(self.memory) < self._BATCH_SIZE):
            return None

        experiences = self.memory.sample(self._BATCH_SIZE)
        states_img = np.array([exp.state[0] for exp in experiences])  # [batch_size, steps, width, height]
        states_last_action = np.array([exp.state[1] for exp in experiences])  # [batch_size, steps, action_dim]
        states = [states_img, states_last_action]
        next_states_img = np.array([exp.next_state[0] for exp in experiences])  # [batch_size, steps, width, height]
        next_states_last_action = np.array([exp.next_state[1] for exp in experiences])  # [batch_size, steps, action_dim]
        next_states = [next_states_img, next_states_last_action]
        actions = np.array([exp.action for exp in experiences])  # [batch_size,]
        # Double DQN: the online network selects the best next action, the target network evaluates it.
        next_q_online_values = self._q_online(next_states)  # [batch_size, action_dim]
        best_next_actions = np.argmax(next_q_online_values, axis=1)  # [batch_size,]
        next_q_target_values = self._q_target(next_states)  # [batch_size, action_dim]
        td_targets = np.array([exp.reward + (1 - exp.done) * self._GAMMA * next_q_target[best_next_action]
                               for exp, next_q_target, best_next_action
                               in zip(experiences, next_q_target_values, best_next_actions)])  # [batch_size,]
        with tf.GradientTape() as tape:
            q_values = self._q_online(states)  # [batch_size, action_dim]
            one_hot_actions = tf.one_hot(actions, self._action_dim)  # [batch_size, action_dim]
            q_values = tf.reduce_sum(q_values * one_hot_actions, axis=1)  # [batch_size,]
            td_errors = td_targets - q_values  # [batch_size,]
            loss = tf.reduce_mean(tf.square(td_errors))
        grads = tape.gradient(loss, self._q_online.trainable_variables)
        self._optimizer.apply_gradients(zip(grads, self._q_online.trainable_variables))
        return loss
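For context, here is a minimal smoke-test sketch of how the agent above might be exercised end to end. The state shapes, the Experience constructor arguments, and the use of random tensors in place of real Super Mario Bros observations are assumptions for illustration; only Mario, Experience, and ReplayBuffer come from this commit's code.

# Hypothetical smoke test for the Mario agent (shapes and Experience fields are assumed, not from this commit).
import numpy as np

ACTION_DIM = 5
STATE_SHAPE = ((4, 84, 84), (1, ACTION_DIM))  # assumed: 4 stacked 84x84 frames plus a one-hot last action

mario = Mario(state_shape=STATE_SHAPE, action_dim=ACTION_DIM)

def random_state():
    """Stand-in for a preprocessed observation; real training would use environment frames instead."""
    return (np.random.rand(*STATE_SHAPE[0]).astype(np.float32),
            np.random.rand(*STATE_SHAPE[1]).astype(np.float32))

state = random_state()
for step in range(2000):
    action, q_values = mario.act(state)
    next_state = random_state()
    # Experience is assumed to accept these fields as keyword arguments; learn() reads exactly these attributes.
    mario.cache(Experience(state=state, action=action, reward=0.0, next_state=next_state, done=False))
    loss = mario.learn()  # None until _LEARN_START calls have accumulated and a batch can be sampled
    state = next_state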
@@ -0,0 +1 @@
from .noisy_dense import NoisyDense
@@ -0,0 +1,116 @@
from typing import Optional

import numpy as np
import tensorflow as tf


class NoisyDense(tf.keras.layers.Layer):
    """Noisy dense layer with factorised Gaussian noise.
    See https://arxiv.org/abs/1706.10295 for details.
    """
    def __init__(self, units: int, activation: Optional[str] = None, sigma_0=0.5, **kwargs):
        """
        :param units: Number of neurons.
        :param activation: Activation function.
        :param sigma_0: Initial scale of the noise standard deviations.
        :param kwargs: Additional arguments to pass to the keras.layers.Layer superclass.
        """
        super().__init__(**kwargs)
        self.units = units
        self.activation = tf.keras.activations.get(activation)
        self._sigma_0 = sigma_0
        self._mu_w = None
        self._sigma_w = None
        self._epsilon_w = None
        self._mu_b = None
        self._sigma_b = None
        self._epsilon_b = None

    def build(self, input_shape):
        """Builds the layer."""
        n_input = input_shape[-1]
        init_mu_min = -1 / np.sqrt(n_input)
        init_mu_max = 1 / np.sqrt(n_input)
        init_sigma = self._sigma_0 / np.sqrt(n_input)
        self._mu_w = self.add_weight(shape=(n_input, self.units),
                                     initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
                                     trainable=self.trainable, name='mu_w')
        self._sigma_w = self.add_weight(shape=(n_input, self.units),
                                        initializer=tf.keras.initializers.Constant(init_sigma),
                                        trainable=self.trainable, name='sigma_w')
        self._mu_b = self.add_weight(shape=(self.units,),
                                     initializer=tf.keras.initializers.RandomUniform(init_mu_min, init_mu_max),
                                     trainable=self.trainable, name='mu_b')
        self._sigma_b = self.add_weight(shape=(self.units,),
                                        initializer=tf.keras.initializers.Constant(init_sigma),
                                        trainable=self.trainable, name='sigma_b')

        # Factorised noise: one noise vector over the inputs and one over the outputs,
        # combined via an outer product. The noise is sampled once when the layer is built.
        dtype = self._mu_w.dtype
        epsilon_in = self._f(tf.random.normal(shape=(self._mu_w.shape[0], 1), dtype=dtype))
        epsilon_out = self._f(tf.random.normal(shape=(1, self._mu_w.shape[1]), dtype=dtype))
        self._epsilon_w = tf.matmul(epsilon_in, epsilon_out)
        self._epsilon_b = epsilon_out

        super().build(input_shape)

    def call(self, inputs, **kwargs) -> tf.Tensor:
        """Calls the layer.
        :param inputs: Input tensor.
        :param kwargs: Additional arguments.
        :return: Output tensor.
        """
        # Perturb the mean parameters with the fixed noise: w = mu_w + sigma_w * epsilon_w, likewise for b.
        w = self._mu_w + self._sigma_w * self._epsilon_w
        b = self._mu_b + self._sigma_b * self._epsilon_b
        output = tf.matmul(inputs, w) + b
        if self.activation is not None:
            output = self.activation(output)
        return output

    @staticmethod
    def _f(x: tf.Tensor) -> tf.Tensor:
        """Applies the noise-scaling function f(x) = sign(x) * sqrt(|x|) from the paper.
        :param x: Input tensor.
        :return: Output tensor.
        """
        return tf.multiply(tf.sign(x), tf.sqrt(tf.abs(x)))

    def get_config(self):
        """Returns the configuration of the layer."""
        config = super().get_config()
        config.update({
            'units': self.units,
            'activation': tf.keras.activations.serialize(self.activation),
            'sigma_0': self._sigma_0
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Creates a layer from its configuration."""
        return cls(**config)

    @property
    def sigma_0(self):
        """Returns the sigma_0 parameter."""
        return self._sigma_0

    @property
    def mu_w(self):
        """Returns the mu_w parameter."""
        return self._mu_w

    @property
    def sigma_w(self):
        """Returns the sigma_w parameter."""
        return self._sigma_w

    @property
    def mu_b(self):
        """Returns the mu_b parameter."""
        return self._mu_b

    @property
    def sigma_b(self):
        """Returns the sigma_b parameter."""
        return self._sigma_b
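A minimal usage sketch of the layer above as a drop-in replacement for a regular Dense layer; everything here is standard TensorFlow/Keras plus the NoisyDense class itself, and the model, shapes, and variable names are illustrative only.

# Hypothetical example: NoisyDense used in place of keras.layers.Dense.
import numpy as np
import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(8,))
hidden = NoisyDense(units=16, activation='relu')(inputs)
outputs = NoisyDense(units=4)(hidden)  # e.g. Q-values for 4 actions
model = tf.keras.Model(inputs=inputs, outputs=outputs)

x = np.random.rand(2, 8).astype(np.float32)
print(model(x).shape)  # -> (2, 4)

# get_config()/from_config() let the layer round-trip through serialization,
# as long as NoisyDense is supplied via custom_objects when a saved model is loaded.
clone = NoisyDense.from_config(model.layers[1].get_config())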