From 506df1f982a9ee5850ca518c2b16dae119549e92 Mon Sep 17 00:00:00 2001 From: mikepratt1 Date: Tue, 26 Mar 2024 11:52:24 +0000 Subject: [PATCH 1/6] agilerl version updates and net_config key updates to align with updated version --- .github/workflows/linux-tutorials-test.yml | 27 ++++++++++++++++++++- docs/tutorials/agilerl/DQN.md | 8 +++--- tutorials/AgileRL/agilerl_dqn_curriculum.py | 8 +++--- tutorials/AgileRL/agilerl_maddpg.py | 8 +++--- tutorials/AgileRL/agilerl_matd3.py | 2 +- tutorials/AgileRL/requirements.txt | 3 +-- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml index 858203d13..8b7b7817b 100644 --- a/.github/workflows/linux-tutorials-test.yml +++ b/.github/workflows/linux-tutorials-test.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: python-version: ['3.8', '3.9', '3.10', '3.11'] - tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test, AgileRL] # TODO: fix tutorials and add back Ray + tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test] # TODO: fix tutorials and add back Ray steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -35,3 +35,28 @@ jobs: pip install -e $root_dir[testing] AutoROM -v for f in *.py; do xvfb-run -a -s "-screen 0 1024x768x24" python "$f"; done + + agilerl-tutorial-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11'] + tutorial: [AgileRL] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies and run tutorials + run: | + sudo apt-get install python3-opengl xvfb parallel + export PATH=/path/to/parallel:$PATH + export root_dir=$(pwd) + cd tutorials/${{ matrix.tutorial }} + pip install -r requirements.txt + pip uninstall -y pettingzoo + pip install -e $root_dir[testing] + AutoROM -v + for f in *.py; do xvfb-run -a -s "-screen 0 1024x768x24" python "$f"; done \ No newline at end of file diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md index 6a7cc3731..5b701f28c 100644 --- a/docs/tutorials/agilerl/DQN.md +++ b/docs/tutorials/agilerl/DQN.md @@ -612,10 +612,10 @@ Before we go any further in this tutorial, it would be helpful to define and set # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [64, 64], # Actor hidden size - "c_size": [128], # CNN channel size - "k_size": [4], # CNN kernel size - "s_size": [1], # CNN stride size + "hidden_size": [64, 64], # Actor hidden size + "channel_size": [128], # CNN channel size + "kernel_size": [4], # CNN kernel size + "stride_size": [1], # CNN stride size "normalize": False, # Normalize image from range [0,255] to [0,1] } diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py index 1b6e86949..a56464c5f 100644 --- a/tutorials/AgileRL/agilerl_dqn_curriculum.py +++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py @@ -494,10 +494,10 @@ def outcome(self, action, player, return_length=False): # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [64, 64], # Actor hidden size - "c_size": [128], # CNN channel size - "k_size": [4], # CNN kernel size - "s_size": [1], # CNN stride size + "hidden_size": [64, 64], # Actor 
hidden size + "channel_size": [128], # CNN channel size + "kernel_size": [4], # CNN kernel size + "stride_size": [1], # CNN stride size "normalize": False, # Normalize image from range [0,255] to [0,1] } diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index 14a93f6e1..edf4eacbf 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -22,10 +22,10 @@ # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [32, 32], # Network hidden size - "c_size": [32, 32], # CNN channel size - "k_size": [3, 3], # CNN kernel size - "s_size": [2, 2], # CNN stride size + "hiden_size": [32, 32], # Network hidden size + "channel_size": [32, 32], # CNN channel size + "kernel_size": [3, 3], # CNN kernel size + "stride_size": [2, 2], # CNN stride size "normalize": True, # Normalize image from range [0,255] to [0,1] } diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py index 46aefb5bf..cc6ed9009 100644 --- a/tutorials/AgileRL/agilerl_matd3.py +++ b/tutorials/AgileRL/agilerl_matd3.py @@ -21,7 +21,7 @@ # Define the network configuration NET_CONFIG = { "arch": "mlp", # Network architecture - "h_size": [32, 32], # Actor hidden size + "hidden_size": [32, 32], # Actor hidden size } # Define the initial hyperparameters diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index dbdd050a0..1262ee83c 100644 --- a/tutorials/AgileRL/requirements.txt +++ b/tutorials/AgileRL/requirements.txt @@ -1,5 +1,4 @@ -agilerl==0.1.21; python_version >= '3.9' -agilerl==0.1.20; python_version < '3.9' +agilerl==0.1.22; python_version >= '3.9' pettingzoo[classic,atari,mpe]>=1.23.1 SuperSuit>=3.9.0 torch>=2.0.1 From ec7bad415e4889ac46008b63ef5e47611526a3a3 Mon Sep 17 00:00:00 2001 From: mikepratt1 Date: Tue, 26 Mar 2024 11:56:21 +0000 Subject: [PATCH 2/6] pre-commit hooks changes --- .github/workflows/linux-tutorials-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml index 8b7b7817b..f74a9b3c5 100644 --- a/.github/workflows/linux-tutorials-test.yml +++ b/.github/workflows/linux-tutorials-test.yml @@ -59,4 +59,4 @@ jobs: pip uninstall -y pettingzoo pip install -e $root_dir[testing] AutoROM -v - for f in *.py; do xvfb-run -a -s "-screen 0 1024x768x24" python "$f"; done \ No newline at end of file + for f in *.py; do xvfb-run -a -s "-screen 0 1024x768x24" python "$f"; done From 82d1d0aad83df49a0968c21977ebffe1611ca754 Mon Sep 17 00:00:00 2001 From: mikepratt1 Date: Tue, 26 Mar 2024 12:43:01 +0000 Subject: [PATCH 3/6] corrected spelling of hidden_size --- tutorials/AgileRL/agilerl_maddpg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index edf4eacbf..37e193f40 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -22,7 +22,7 @@ # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "hiden_size": [32, 32], # Network hidden size + "hidden_size": [32, 32], # Network hidden size "channel_size": [32, 32], # CNN channel size "kernel_size": [3, 3], # CNN kernel size "stride_size": [2, 2], # CNN stride size From fddc36909d19497cb054fbf560618eba6825c178 Mon Sep 17 00:00:00 2001 From: mikepratt1 Date: Fri, 25 Oct 2024 14:39:43 +0100 Subject: [PATCH 4/6] update agilerl tutorials --- 
docs/tutorials/agilerl/DQN.md | 242 +++++++-------- docs/tutorials/agilerl/MADDPG.md | 2 +- tutorials/AgileRL/agilerl_dqn_curriculum.py | 269 ++++++++-------- tutorials/AgileRL/agilerl_maddpg.py | 327 ++++++++++---------- tutorials/AgileRL/agilerl_matd3.py | 220 +++++++------ tutorials/AgileRL/render_agilerl_dqn.py | 37 +-- tutorials/AgileRL/render_agilerl_maddpg.py | 36 +-- tutorials/AgileRL/render_agilerl_matd3.py | 35 +-- tutorials/AgileRL/requirements.txt | 3 +- 9 files changed, 585 insertions(+), 586 deletions(-) diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md index 6a7cc3731..9cd9cb30a 100644 --- a/docs/tutorials/agilerl/DQN.md +++ b/docs/tutorials/agilerl/DQN.md @@ -62,7 +62,7 @@ Importing the following packages, functions and classes will enable us to run th from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection - from agilerl.utils.utils import initialPopulation + from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -167,27 +167,23 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip(observation, player = 0) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -211,7 +207,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -221,31 +217,25 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( 
p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip(observation, player = 1) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -269,7 +259,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -431,11 +421,11 @@ When defining the different lessons in our curriculum, we can increase the diffi self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -612,10 +602,10 @@ Before we go any further in this tutorial, it would be helpful to define and set # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [64, 64], # Actor hidden size - "c_size": [128], # CNN channel size - "k_size": [4], # CNN kernel size - "s_size": [1], # CNN stride size + "hidden_size": [64, 64], # Actor hidden size + "channel_size": [128], # CNN channel size + "kernel_size": [4], # CNN kernel size + "stride_size": [1], # CNN stride size "normalize": False, # Normalize image from range [0,255] to [0,1] } @@ -640,7 +630,6 @@ Before we go any further in this tutorial, it would be helpful to define and set "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -667,7 +656,7 @@ Before we go any further in this tutorial, it would be helpful to define and set action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -681,7 +670,6 @@ Before we go any further in this tutorial, it would be helpful to define and set # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -692,8 +680,8 @@ Before we go any further in this tutorial, it would be helpful to define and set tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # 
Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -733,7 +721,6 @@ Before we go any further in this tutorial, it would be helpful to define and set eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] ``` @@ -745,6 +732,7 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ```python # Perform buffer and agent warmups if desired + # Perform buffer and agent warmups if desired if LESSON["buffer_warm_up"]: warm_up_opponent = Opponent(env, difficulty=LESSON["warm_up_opponent"]) memory = env.fill_replay_buffer( @@ -763,6 +751,33 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ``` +The observation space of Connect Four is (6, 7, 2), where the first two dimensions represent the board and the third dimension represents the player. As PyTorch uses channels-first by default, we need to preprocess the observation. Moreover, we need to flip and swap the planes of the observation to account for the fact that the agent will play as both player 0 and player 1. We can define a function to do this as follows: + +
+ Transform and Flip
+
+  ```python
+  def transform_and_flip(observation, player):
+      """Transforms and flips observation for input to agent's neural network.
+
+      :param observation: Observation to preprocess
+      :type observation: dict[str, np.ndarray]
+      :param player: Player, 0 or 1
+      :type player: int
+      """
+      state = observation["observation"]
+      # Pre-process dimensions for PyTorch (N, C, H, W)
+      state = np.moveaxis(state, [-1], [-3])
+      if player == 1:
+          # Swap pieces so that the agent always sees the board from the same perspective
+          state[[0, 1], :, :] = state[[1, 0], :, :]
+      state_flipped = np.expand_dims(np.flip(state, 2), 0)
+      state = np.expand_dims(state, 0)
+      return state, state_flipped
+  ```
+
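As a quick sanity check of the helper above, the sketch below (not part of the patch; the `dummy_observation` is a hand-built stand-in matching Connect Four's `(6, 7, 2)` observation shape, and it assumes `transform_and_flip` from the snippet above is in scope) confirms the channels-first, batched output shapes:

```python
import numpy as np

# Hypothetical check: a fake Connect Four observation with 6 rows x 7 columns x 2 player planes,
# containing a single piece for player 0 in the centre column.
dummy_observation = {
    "observation": np.zeros((6, 7, 2), dtype=np.int8),
    "action_mask": np.ones(7, dtype=np.int8),
}
dummy_observation["observation"][5, 3, 0] = 1

state, state_flipped = transform_and_flip(dummy_observation, player=0)

# Both outputs are channels-first with a batch dimension: (N, C, H, W) = (1, 2, 6, 7)
assert state.shape == (1, 2, 6, 7)
assert state_flipped.shape == (1, 2, 6, 7)
# The flip mirrors the board left-to-right, so a piece in the centre column stays put
assert state_flipped[0, 0, 5, 3] == 1
```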
+ + ### Self-play In this tutorial, we use self-play as the final lesson in our curriculum. By iteratively improving our agent and making it learn to win against itself, we can allow it to discover new strategies and achieve higher performance. The weights of our pretrained agent from an earlier lesson can be loaded to the population as follows: @@ -774,7 +789,7 @@ In this tutorial, we use self-play as the final lesson in our curriculum. By ite if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -824,24 +839,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents ```python if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -854,7 +868,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -883,23 +897,21 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -907,23 +919,18 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player = 0 ) - 
p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -952,7 +959,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -964,29 +971,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -994,24 +995,19 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + observation, cumulative_reward, done, truncation, _ = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player = 1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -1045,7 +1041,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -1100,7 +1096,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() player = -1 # Tracker for which player"s turn it is @@ -1120,42 +1116,42 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents if player < 0: 
if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1192,34 +1188,34 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") + + pbar.close() ``` diff --git a/docs/tutorials/agilerl/MADDPG.md b/docs/tutorials/agilerl/MADDPG.md index bc6c52e8b..7052b8b1a 100644 --- a/docs/tutorials/agilerl/MADDPG.md +++ b/docs/tutorials/agilerl/MADDPG.md @@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below. ``` ## Code -### Train multiple agents using MADDPG +### Train agents using MADDPG The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. 
If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux). ```{eval-rst} diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py index 1b6e86949..6b9ec9770 100644 --- a/tutorials/AgileRL/agilerl_dqn_curriculum.py +++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py @@ -2,6 +2,7 @@ Author: Nick (https://github.com/nicku-a) """ + import copy import os import random @@ -15,7 +16,7 @@ from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent): while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player=0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 + ) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent): # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 + ) if not opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = 
np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 + ) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -323,11 +320,11 @@ def __init__(self, env, difficulty): self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False): return (True, reward, ended) + ((lengths,) if return_length else ()) +def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. + + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + + if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("===== AgileRL Curriculum Learning Demo =====") @@ -494,10 +510,10 @@ def outcome(self, action, player, return_length=False): # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [64, 64], # Actor hidden size - "c_size": [128], # CNN channel size - "k_size": [4], # CNN kernel size - "s_size": [1], # CNN stride size + "hidden_size": [64, 64], # Actor hidden size + "channel_size": [128], # CNN channel size + "kernel_size": [4], # CNN kernel size + "stride_size": [1], # CNN stride size "normalize": False, # Normalize image from range [0,255] to [0,1] } @@ -522,7 +538,6 @@ def outcome(self, action, player, return_length=False): "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -549,7 +564,7 @@ def outcome(self, action, player, return_length=False): action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -563,7 +578,6 @@ def outcome(self, action, player, return_length=False): # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions 
memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -574,8 +588,8 @@ def outcome(self, action, player, return_length=False): tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -606,12 +620,7 @@ def outcome(self, action, player, return_length=False): # Define training loop parameters episodes_per_epoch = 10 - - # ! NOTE: Uncomment the max_episodes line below to change the number of training episodes. ! # - # It is deliberately set low to allow testing to ensure this tutorial is sound. - max_episodes = 10 - # max_episodes = LESSON["max_train_episodes"] # Total episodes - + max_episodes = LESSON["max_train_episodes"] # Total episodes max_steps = 500 # Maximum steps to take in each episode evo_epochs = 20 # Evolution frequency evo_loop = 50 # Number of evaluation episodes @@ -620,12 +629,11 @@ def outcome(self, action, player, return_length=False): eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -659,24 +667,23 @@ def outcome(self, action, player, return_length=False): print("Agent population warmed up.") if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -689,7 +696,7 @@ def outcome(self, action, player, return_length=False): for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -718,23 +725,23 @@ def outcome(self, action, player, return_length=False): for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip( + observation, player=0 + ) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( 
p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -742,23 +749,18 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 ) - p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -787,7 +789,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -799,29 +801,25 @@ def outcome(self, action, player, return_length=False): # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -829,24 +827,25 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -880,7 +879,7 @@ def 
outcome(self, action, player, return_length=False): else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -935,7 +934,13 @@ def outcome(self, action, player, return_length=False): rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() player = -1 # Tracker for which player"s turn it is @@ -955,42 +960,52 @@ def outcome(self, action, player, return_length=False): if player < 0: if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1027,31 +1042,29 @@ def outcome(self, action, player, return_length=False): for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - 
elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index 14a93f6e1..99d19e17c 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -2,76 +2,88 @@ Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a) """ + import os +from copy import deepcopy import numpy as np import supersuit as ss import torch from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer -from agilerl.hpo.mutation import Mutations -from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from pettingzoo.atari import space_invaders_v2 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MADDPG Demo =====") # Define the network configuration NET_CONFIG = { "arch": "cnn", # Network architecture - "h_size": [32, 32], # Network hidden size - "c_size": [32, 32], # CNN channel size - "k_size": [3, 3], # CNN kernel size - "s_size": [2, 2], # CNN stride size + "hidden_size": [32, 32], # Network hidden size + "channel_size": [32, 32], # CNN channel size + "kernel_size": [3, 3], # CNN kernel size + "stride_size": [2, 2], # CNN stride size "normalize": True, # Normalize image from range [0,255] to [0,1] } # Define the initial hyperparameters INIT_HP = { - "POPULATION_SIZE": 2, + "POPULATION_SIZE": 1, "ALGO": "MADDPG", # Algorithm # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": True, - "BATCH_SIZE": 8, # Batch size + "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor - "MEMORY_SIZE": 10000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "MEMORY_SIZE": 100000, # Max memory buffer size + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters } + num_envs = 8 # Define the space invaders environment as a parallel environment env = space_invaders_v2.parallel_env() - if INIT_HP["CHANNELS_LAST"]: - # Environment processing for image based observations - env = ss.frame_skip_v0(env, 4) - env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) - env = ss.color_reduction_v0(env, mode="B") - env = ss.resize_v1(env, x_size=84, y_size=84) - env = ss.frame_stack_v1(env, 4) + + # Environment processing for image based observations + env = ss.frame_skip_v0(env, 4) + env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) + env = ss.color_reduction_v0(env, mode="B") + env = ss.resize_v1(env, x_size=84, y_size=84) + env = ss.frame_stack_v1(env, 4) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) + env.reset() # Configure the multi-agent algo input arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = 
[env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Pre-process image dimensions for pytorch convolutional layers if INIT_HP["CHANNELS_LAST"]: @@ -84,7 +96,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + agent = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -92,8 +104,9 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, - ) + )[0] # Configure the multi-agent replay buffer field_names = ["state", "action", "reward", "next_state", "done"] @@ -104,152 +117,138 @@ device=device, ) - # Instantiate a tournament selection object (used for HPO) - tournament = TournamentSelection( - tournament_size=2, # Tournament selection size - elitism=True, # Elitism in tournament selection - population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores - - # Instantiate a mutations object (used for HPO) - mutations = Mutations( - algo=INIT_HP["ALGO"], - no_mutation=0.2, # Probability of no mutation - architecture=0.2, # Probability of architecture mutation - new_layer_prob=0.2, # Probability of new layer mutation - parameters=0.2, # Probability of parameter mutation - activation=0, # Probability of activation function mutation - rl_hp=0.2, # Probability of RL hyperparameter mutation - rl_hp_selection=[ - "lr", - "learn_step", - "batch_size", - ], # RL hyperparams selected for mutation - mutation_sd=0.1, # Mutation strength - # Define search space for each hyperparameter - min_lr=0.0001, - max_lr=0.01, - min_learn_step=1, - max_learn_step=120, - min_batch_size=8, - max_batch_size=64, - agent_ids=INIT_HP["AGENT_IDS"], # Agent IDs - arch=NET_CONFIG["arch"], # MLP or CNN - rand_seed=1, - device=device, - ) - # Define training loop parameters - max_episodes = 5 # Total episodes (default: 6000) - max_steps = 900 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes - elite = pop[0] # Assign a placeholder "elite" agent - - # Training loop - for idx_epi in trange(max_episodes): - for agent in pop: # Loop through population - state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + agent_ids = deepcopy(env.agents) + max_steps = 20000 # Max steps (default: 2000000) + learning_delay = 500 # Steps before starting learning + training_steps = 10000 # Frequency at which 
we evaluate training score + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes + + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less(agent.steps[-1], max_steps): + state, info = env.reset() # Reset environment at start of episode + scores = np.zeros((num_envs, len(agent_ids))) + completed_episode_scores = [] + steps = 0 + if INIT_HP["CHANNELS_LAST"]: + state = { + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() + } + + for idx_step in range(training_steps // num_envs): + # Get next action from agent + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info + ) + if agent.discrete_actions: + action = discrete_action + else: + action = cont_actions + + # Act in environment + action = {agent: env.action_space(agent).sample() for agent in env.agents} + next_state, reward, termination, truncation, info = env.step(action) + if not termination: + assert False + scores += np.array(list(reward.values())).transpose() + total_steps += num_envs + steps += num_envs + + # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) - for agent_id, s in state.items() + next_state = { + agent_id: np.moveaxis(ns, [-1], [-3]) + for agent_id, ns in next_state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - - # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions - ) - if agent.discrete_actions: - action = discrete_action - else: - action = cont_actions - - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment - - # Image processing if necessary for the environment - if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} - next_state = { - agent_id: np.moveaxis(ns, [-1], [-3]) - for agent_id, ns in next_state.items() - } - - # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r - - # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + + # Save experiences to replay buffer + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) + + # Learn according to learning frequency + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm - - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } - state = next_state - - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # 
Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] - - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif len(memory) >= agent.batch_size and memory.counter > learning_delay: + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + + state = next_state + + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) + + pbar.update(training_steps) + + agent.steps[-1] += steps + + # Evaluate population + fitness = agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, + sum_scores=False, + ) + pop_episode_scores = np.array(completed_episode_scores) + mean_scores = np.mean(pop_episode_scores, axis=0) + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {agent.steps[-1]}") + print("Scores:") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} score: {mean_scores[idx]}") + print("Fitness") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} fitness: {fitness[idx]}") + print("Previous 5 fitness avgs") + for idx, sub_agent in enumerate(agent_ids): print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + f" {sub_agent} fitness average: {np.mean(agent.fitness[-5:], axis=0)[idx]}" ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Update step counter + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MADDPG" filename = "MADDPG_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + agent.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py index 46aefb5bf..11335b45a 100644 --- a/tutorials/AgileRL/agilerl_matd3.py +++ b/tutorials/AgileRL/agilerl_matd3.py @@ -2,6 +2,7 @@ Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a) """ + import os import numpy as np @@ -9,19 +10,20 @@ from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from pettingzoo.mpe import 
simple_speaker_listener_v4 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MATD3 Demo =====") + print("===== AgileRL Online Multi-Agent Demo =====") # Define the network configuration NET_CONFIG = { "arch": "mlp", # Network architecture - "h_size": [32, 32], # Actor hidden size + "hidden_size": [32, 32], # Actor hidden size } # Define the initial hyperparameters @@ -31,36 +33,47 @@ # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": False, "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor "MEMORY_SIZE": 100000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters "POLICY_FREQ": 2, # Policy frequnecy } + num_envs = 8 # Define the simple speaker listener environment as a parallel environment env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) env.reset() # Configure the multi-agent algo input arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = [env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Not applicable to MPE environments, used when images are used for observations (Atari environments) if INIT_HP["CHANNELS_LAST"]: @@ -73,7 +86,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -81,6 +94,7 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, ) @@ -98,8 +112,8 @@ tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -123,116 +137,148 @@ ) # Define training loop parameters - 
max_episodes = 500 # Total episodes (default: 6000) - max_steps = 25 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes + max_steps = 13000 # Max steps (default: 2000000) + learning_delay = 0 # Steps before starting learning + evo_steps = 1000 # Evolution frequency + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes elite = pop[0] # Assign a placeholder "elite" agent - # Training loop - for idx_epi in trange(max_episodes): + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less([agent.steps[-1] for agent in pop], max_steps).all(): + pop_episode_scores = [] for agent in pop: # Loop through population state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + scores = np.zeros(num_envs) + completed_episode_scores = [] + steps = 0 if INIT_HP["CHANNELS_LAST"]: state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - + for idx_step in range(evo_steps // num_envs): # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info ) if agent.discrete_actions: action = discrete_action else: action = cont_actions - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment + # Act in environment + next_state, reward, termination, truncation, info = env.step(action) + + scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1) + total_steps += num_envs + steps += num_envs # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} next_state = { agent_id: np.moveaxis(ns, [-1], [-3]) for agent_id, ns in next_state.items() } # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay + ): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif ( + len(memory) >= agent.batch_size and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm + for _ in 
range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } state = next_state - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + pbar.update(evo_steps // len(pop)) + + agent.steps[-1] += steps + pop_episode_scores.append(completed_episode_scores) + + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, ) + for agent in pop + ] + mean_scores = [ + ( + np.mean(episode_scores) + if len(episode_scores) > 0 + else "0 completed episodes" + ) + for episode_scores in pop_episode_scores + ] + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {[agent.steps[-1] for agent in pop]}") + print(f"Scores: {mean_scores}") + print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}') + print( + f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}' + ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Tournament selection and population mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + # Update step counter + for agent in pop: + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MATD3" filename = "MATD3_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py index f5a2d4b38..67d3ad9cc 100644 --- a/tutorials/AgileRL/render_agilerl_dqn.py +++ b/tutorials/AgileRL/render_agilerl_dqn.py @@ -4,7 +4,7 @@ import numpy as np import torch from agilerl.algorithms.dqn import DQN -from agilerl_dqn_curriculum import Opponent +from agilerl_dqn_curriculum import Opponent, transform_and_flip from PIL import Image, ImageDraw, ImageFont from pettingzoo.classic import connect_four_v3 @@ -68,16 +68,8 @@ def resize_frames(frames, fraction): state_dim = np.zeros(state_dim[0]).flatten().shape action_dim = 
action_dim[0] - # Instantiate an DQN object - dqn = DQN( - state_dim, - action_dim, - one_hot, - device=device, - ) - - # Load the saved algorithm into the DQN object - dqn.loadCheckpoint(path) + # Load the saved agent + dqn = DQN.load(path, device) for opponent_difficulty in ["random", "weak", "strong", "self"]: # Create opponent @@ -120,38 +112,35 @@ def resize_frames(frames, fraction): for idx_step in range(max_steps): action_mask = observation["action_mask"] if player < 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=0) if opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 ] # Get next action from agent if player > 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state[[0, 1], :, :] = state[[0, 1], :, :] - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=1) if not opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py index ca47349d5..2713b48fd 100644 --- a/tutorials/AgileRL/render_agilerl_maddpg.py +++ b/tutorials/AgileRL/render_agilerl_maddpg.py @@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - maddpg = MADDPG( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MADDPG/MADDPG_trained_agent.pt" - maddpg.loadCheckpoint(path) + maddpg = MADDPG.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -106,20 +93,9 @@ def _label_with_episode_number(frame, episode_num): agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) for agent_id, s in state.items() } - - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = maddpg.getAction( - state, - epsilon=0, - agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = maddpg.get_action( + state, training=False, infos=info ) if maddpg.discrete_actions: action = discrete_action @@ -131,7 +107,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = 
env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py index efcc610cd..8bfae5673 100644 --- a/tutorials/AgileRL/render_agilerl_matd3.py +++ b/tutorials/AgileRL/render_agilerl_matd3.py @@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - matd3 = MATD3( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MATD3/MATD3_trained_agent.pt" - matd3.loadCheckpoint(path) + matd3 = MATD3.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -94,19 +81,9 @@ def _label_with_episode_number(frame, episode_num): agent_reward = {agent_id: 0 for agent_id in agent_ids} score = 0 for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = matd3.getAction( - state, - epsilon=0, - agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = matd3.get_action( + state, training=False, infos=info ) if matd3.discrete_actions: action = discrete_action @@ -118,7 +95,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index dbdd050a0..35b6d42a9 100644 --- a/tutorials/AgileRL/requirements.txt +++ b/tutorials/AgileRL/requirements.txt @@ -1,5 +1,4 @@ -agilerl==0.1.21; python_version >= '3.9' -agilerl==0.1.20; python_version < '3.9' +agilerl==1.0.16; python_version >= '3.10' pettingzoo[classic,atari,mpe]>=1.23.1 SuperSuit>=3.9.0 torch>=2.0.1 From 9ae9b6abeda96d83493c03e0147a054a3f194caa Mon Sep 17 00:00:00 2001 From: mikepratt1 Date: Wed, 4 Dec 2024 11:33:53 +0000 Subject: [PATCH 5/6] Update agilerl version --- tutorials/AgileRL/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index 1262ee83c..71b63edf6 100644 --- a/tutorials/AgileRL/requirements.txt +++ b/tutorials/AgileRL/requirements.txt @@ -1,4 +1,4 @@ -agilerl==0.1.22; python_version >= '3.9' +agilerl==1.0.20; python_version >= '3.9' pettingzoo[classic,atari,mpe]>=1.23.1 SuperSuit>=3.9.0 torch>=2.0.1 From 802d821e5e21cb71d63145f24a61551b17996eeb Mon Sep 17 00:00:00 2001 From: Michael Pratt <118982716+mikepratt1@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:55:10 +0000 Subject: [PATCH 6/6] Update requirements.txt - Update python version to 3.10 --- tutorials/AgileRL/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index 71b63edf6..e5f7d1b3d 100644 --- a/tutorials/AgileRL/requirements.txt +++ b/tutorials/AgileRL/requirements.txt @@ -1,4 +1,4 @@ -agilerl==1.0.20; python_version >= '3.9' +agilerl==1.0.20; python_version >= '3.10' pettingzoo[classic,atari,mpe]>=1.23.1 SuperSuit>=3.9.0 torch>=2.0.1