update agilerl tutorials
mikepratt1 committed Oct 25, 2024
1 parent 0cdf49e commit fddc369
Showing 9 changed files with 585 additions and 586 deletions.
242 changes: 119 additions & 123 deletions docs/tutorials/agilerl/DQN.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/tutorials/agilerl/MADDPG.md
@@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below.
```

## Code
### Train multiple agents using MADDPG
### Train agents using MADDPG
The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux).
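For readers new to the parallel API that these tutorials rely on, here is a minimal, self-contained PettingZoo loop with random actions. It is only an illustrative sketch of the environment interface; the tutorial script adds AgileRL training, vectorisation and hyperparameter optimisation on top of it.

```python
# Minimal PettingZoo parallel-API loop with random actions (no learning).
from pettingzoo.mpe import simple_speaker_listener_v4

env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
observations, infos = env.reset(seed=42)

while env.agents:
    # Sample one action per currently active agent
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
env.close()
```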

```{eval-rst}
269 changes: 141 additions & 128 deletions tutorials/AgileRL/agilerl_dqn_curriculum.py

Large diffs are not rendered by default.

327 changes: 163 additions & 164 deletions tutorials/AgileRL/agilerl_maddpg.py

Large diffs are not rendered by default.

220 changes: 133 additions & 87 deletions tutorials/AgileRL/agilerl_matd3.py
@@ -2,26 +2,28 @@
Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a)
"""

import os

import numpy as np
import torch
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from agilerl.utils.utils import create_population
from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv
from tqdm import trange

from pettingzoo.mpe import simple_speaker_listener_v4

if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL MATD3 Demo =====")
print("===== AgileRL Online Multi-Agent Demo =====")

# Define the network configuration
NET_CONFIG = {
"arch": "mlp", # Network architecture
"h_size": [32, 32], # Actor hidden size
"hidden_size": [32, 32], # Actor hidden size
}
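The NET_CONFIG above requests an MLP with two hidden layers of 32 units. As a rough sketch of the kind of network this corresponds to (AgileRL builds its own actor and critic networks from this config, so the helper below is purely illustrative):

```python
import torch.nn as nn

# Roughly what "hidden_size": [32, 32] describes; AgileRL constructs its own
# networks from NET_CONFIG, so this helper is illustrative only.
def make_mlp(input_dim: int, output_dim: int, hidden_size=(32, 32)) -> nn.Sequential:
    layers, in_dim = [], input_dim
    for h in hidden_size:
        layers += [nn.Linear(in_dim, h), nn.ReLU()]
        in_dim = h
    layers.append(nn.Linear(in_dim, output_dim))
    return nn.Sequential(*layers)

actor = make_mlp(input_dim=11, output_dim=5)  # hypothetical dimensions
```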

# Define the initial hyperparameters
@@ -31,36 +33,47 @@
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": False,
"BATCH_SIZE": 32, # Batch size
"O_U_NOISE": True, # Ornstein Uhlenbeck action noise
"EXPL_NOISE": 0.1, # Action noise scale
"MEAN_NOISE": 0.0, # Mean action noise
"THETA": 0.15, # Rate of mean reversion in OU noise
"DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
"LR_CRITIC": 0.01, # Critic learning rate
"LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 100000, # Max memory buffer size
"LEARN_STEP": 5, # Learning frequency
"LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
"POLICY_FREQ": 2, # Policy frequnecy
}
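The hyperparameters O_U_NOISE, EXPL_NOISE, MEAN_NOISE, THETA and DT configure Ornstein-Uhlenbeck action noise for exploration. A minimal NumPy sketch of such a process using the same values (the textbook Euler-Maruyama discretisation, not necessarily AgileRL's internal implementation):

```python
import numpy as np

# Textbook Euler-Maruyama discretisation of an Ornstein-Uhlenbeck process,
# using the hyperparameter values above (illustrative; not AgileRL's code).
theta, dt, mu, sigma = 0.15, 0.01, 0.0, 0.1
noise = np.zeros(5)  # one value per action dimension (hypothetical size)

def ou_step(x: np.ndarray) -> np.ndarray:
    # Drift back towards the mean mu, plus scaled Gaussian noise
    return x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.standard_normal(x.shape)

for _ in range(10):
    noise = ou_step(noise)
print(noise)
```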

num_envs = 8
# Define the simple speaker listener environment as a parallel environment
env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)])
env.reset()

# Configure the multi-agent algo input arguments
try:
state_dim = [env.observation_space(agent).n for agent in env.agents]
state_dim = [env.single_observation_space(agent).n for agent in env.agents]
one_hot = True
except Exception:
state_dim = [env.observation_space(agent).shape for agent in env.agents]
state_dim = [env.single_observation_space(agent).shape for agent in env.agents]
one_hot = False
try:
action_dim = [env.action_space(agent).n for agent in env.agents]
action_dim = [env.single_action_space(agent).n for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = True
INIT_HP["MAX_ACTION"] = None
INIT_HP["MIN_ACTION"] = None
except Exception:
action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = False
INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
INIT_HP["MAX_ACTION"] = [
env.single_action_space(agent).high for agent in env.agents
]
INIT_HP["MIN_ACTION"] = [
env.single_action_space(agent).low for agent in env.agents
]
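The try/except blocks above distinguish discrete from continuous spaces by probing for `.n` versus `.shape`. An equivalent, more explicit check using Gymnasium space types (a sketch only; the tutorial keeps the try/except form):

```python
from gymnasium import spaces

# Explicit version of the try/except space probing above (sketch only).
def space_dims(space):
    if isinstance(space, spaces.Discrete):
        return space.n, True       # dimension, one-hot encoding needed
    if isinstance(space, spaces.Box):
        return space.shape, False  # shape, no one-hot encoding
    raise TypeError(f"Unsupported space type: {type(space)}")
```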

# Not applicable to MPE environments, used when images are used for observations (Atari environments)
if INIT_HP["CHANNELS_LAST"]:
@@ -73,14 +86,15 @@
INIT_HP["AGENT_IDS"] = env.agents

# Create a population ready for evolutionary hyper-parameter optimisation
pop = initialPopulation(
pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
one_hot,
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
num_envs=num_envs,
device=device,
)

@@ -98,8 +112,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
evo_step=1,
) # Evaluate using last N fitness scores
eval_loop=1, # Evaluate using last N fitness scores
)
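With elitism=True the best agent is carried over unchanged, and the rest of the next population is filled by repeated tournaments of size 2 over the agents' fitness scores. A generic sketch of that selection scheme (illustrative only; AgileRL's own implementation lives in agilerl.hpo.tournament):

```python
import numpy as np

# Generic tournament selection with elitism (sketch; not AgileRL's implementation).
def tournament_select(population, fitnesses, tournament_size=2, elitism=True):
    rng = np.random.default_rng()
    elite_idx = int(np.argmax(fitnesses))
    new_pop = [population[elite_idx]] if elitism else []
    while len(new_pop) < len(population):
        contenders = rng.choice(len(population), size=tournament_size, replace=False)
        winner = max(contenders, key=lambda i: fitnesses[i])
        new_pop.append(population[winner])
    return population[elite_idx], new_pop

elite, next_pop = tournament_select(["agent_0", "agent_1", "agent_2", "agent_3"],
                                    [1.0, 3.0, 2.0, 0.5])
```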

# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -123,116 +137,148 @@
)

# Define training loop parameters
max_episodes = 500 # Total episodes (default: 6000)
max_steps = 25 # Maximum steps to take in each episode
epsilon = 1.0 # Starting epsilon value
eps_end = 0.1 # Final epsilon value
eps_decay = 0.995 # Epsilon decay
evo_epochs = 20 # Evolution frequency
evo_loop = 1 # Number of evaluation episodes
max_steps = 13000 # Max steps (default: 2000000)
learning_delay = 0 # Steps before starting learning
evo_steps = 1000 # Evolution frequency
eval_steps = None # Evaluation steps per episode - go until done
eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent

# Training loop
for idx_epi in trange(max_episodes):
total_steps = 0

# TRAINING LOOP
print("Training...")
pbar = trange(max_steps, unit="step")
while np.less([agent.steps[-1] for agent in pop], max_steps).all():
pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
agent_reward = {agent_id: 0 for agent_id in env.agents}
scores = np.zeros(num_envs)
completed_episode_scores = []
steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}

for _ in range(max_steps):
agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
env_defined_actions = (
info["env_defined_actions"]
if "env_defined_actions" in info.keys()
else None
)

for idx_step in range(evo_steps // num_envs):
# Get next action from agent
cont_actions, discrete_action = agent.getAction(
state, epsilon, agent_mask, env_defined_actions
cont_actions, discrete_action = agent.get_action(
states=state, training=True, infos=info
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions

next_state, reward, termination, truncation, info = env.step(
action
) # Act in environment
# Act in environment
next_state, reward, termination, truncation, info = env.step(action)

scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
total_steps += num_envs
steps += num_envs

# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}

# Save experiences to replay buffer
memory.save2memory(state, cont_actions, reward, next_state, termination)

# Collect the reward
for agent_id, r in reward.items():
agent_reward[agent_id] += r
memory.save_to_memory(
state,
cont_actions,
reward,
next_state,
termination,
is_vectorised=True,
)

# Learn according to learning frequency
if (memory.counter % agent.learn_step == 0) and (
len(memory) >= agent.batch_size
# Handle learn steps > num_envs
if agent.learn_step > num_envs:
learn_step = agent.learn_step // num_envs
if (
idx_step % learn_step == 0
and len(memory) >= agent.batch_size
and memory.counter > learning_delay
):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)
# Handle num_envs > learn step; learn multiple times per step in env
elif (
len(memory) >= agent.batch_size and memory.counter > learning_delay
):
experiences = memory.sample(
agent.batch_size
) # Sample replay buffer
agent.learn(experiences) # Learn according to agent's RL algorithm
for _ in range(num_envs // agent.learn_step):
# Sample replay buffer
experiences = memory.sample(agent.batch_size)
# Learn according to agent's RL algorithm
agent.learn(experiences)

# Update the state
if INIT_HP["CHANNELS_LAST"]:
next_state = {
agent_id: np.expand_dims(ns, 0)
for agent_id, ns in next_state.items()
}
state = next_state

# Stop episode if any agents have terminated
if any(truncation.values()) or any(termination.values()):
break

# Save the total episode reward
score = sum(agent_reward.values())
agent.scores.append(score)

# Update epsilon for exploration
epsilon = max(eps_end, epsilon * eps_decay)

# Now evolve population if necessary
if (idx_epi + 1) % evo_epochs == 0:
# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=max_steps,
loop=evo_loop,
)
for agent in pop
]
# Calculate scores and reset noise for finished episodes
reset_noise_indices = []
term_array = np.array(list(termination.values())).transpose()
trunc_array = np.array(list(truncation.values())).transpose()
for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
if np.any(d) or np.any(t):
completed_episode_scores.append(scores[idx])
agent.scores.append(scores[idx])
scores[idx] = 0
reset_noise_indices.append(idx)
agent.reset_action_noise(reset_noise_indices)

print(f"Episode {idx_epi + 1}/{max_episodes}")
print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
print(
f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
pbar.update(evo_steps // len(pop))

agent.steps[-1] += steps
pop_episode_scores.append(completed_episode_scores)

# Evaluate population
fitnesses = [
agent.test(
env,
swap_channels=INIT_HP["CHANNELS_LAST"],
max_steps=eval_steps,
loop=eval_loop,
)
for agent in pop
]
mean_scores = [
(
np.mean(episode_scores)
if len(episode_scores) > 0
else "0 completed episodes"
)
for episode_scores in pop_episode_scores
]

print(f"--- Global steps {total_steps} ---")
print(f"Steps {[agent.steps[-1] for agent in pop]}")
print(f"Scores: {mean_scores}")
print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
print(
f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
)

# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)

# Update step counter
for agent in pop:
agent.steps.append(agent.steps[-1])

# Save the trained algorithm
path = "./models/MATD3"
filename = "MATD3_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
elite.saveCheckpoint(save_path)
elite.save_checkpoint(save_path)

pbar.close()
env.close()
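With vectorised environments every call to env.step() advances num_envs timesteps, so the training loop above scales the learning frequency: it learns once every learn_step // num_envs iterations when learn_step > num_envs, and num_envs // learn_step times per iteration otherwise. The same logic, distilled into a small standalone helper (a sketch of the scheduling only, not of the learning itself):

```python
# Sketch of the learn-frequency scheduling used in the training loop above.
def updates_this_iteration(idx_step: int, learn_step: int, num_envs: int, buffer_ready: bool) -> int:
    """Return how many gradient updates to run for this vectorised env step."""
    if not buffer_ready:  # replay buffer too small, or still within learning_delay
        return 0
    if learn_step > num_envs:
        # Learn once every (learn_step // num_envs) iterations
        return 1 if idx_step % (learn_step // num_envs) == 0 else 0
    # num_envs >= learn_step: learn multiple times per env step
    return num_envs // learn_step

assert updates_this_iteration(0, learn_step=100, num_envs=8, buffer_ready=True) == 1
assert updates_this_iteration(5, learn_step=100, num_envs=8, buffer_ready=True) == 0
assert updates_this_iteration(3, learn_step=5, num_envs=8, buffer_ready=True) == 1
```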
