diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml index d559302eb..b31b8cbff 100644 --- a/.github/workflows/linux-tutorials-test.yml +++ b/.github/workflows/linux-tutorials-test.yml @@ -18,7 +18,8 @@ jobs: matrix: python-version: ['3.8', '3.9', '3.10', '3.11'] - tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/test] # TODO: fix tutorials and add back Ray, fix SB3/connect_four tutorial + tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test] # TODO: fix tutorials and add back Ray + steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md index 5b701f28c..9cd9cb30a 100644 --- a/docs/tutorials/agilerl/DQN.md +++ b/docs/tutorials/agilerl/DQN.md @@ -62,7 +62,7 @@ Importing the following packages, functions and classes will enable us to run th from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection - from agilerl.utils.utils import initialPopulation + from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -167,27 +167,23 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip(observation, player = 0) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -211,7 +207,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -221,31 +217,25 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: p1_action = 
self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip(observation, player = 1) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -269,7 +259,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -431,11 +421,11 @@ When defining the different lessons in our curriculum, we can increase the diffi self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -640,7 +630,6 @@ Before we go any further in this tutorial, it would be helpful to define and set "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -667,7 +656,7 @@ Before we go any further in this tutorial, it would be helpful to define and set action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -681,7 +670,6 @@ Before we go any further in this tutorial, it would be helpful to define and set # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -692,8 +680,8 @@ Before we go any further in this tutorial, it would be helpful to define and set tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -733,7 +721,6 @@ Before we go any further in this tutorial, it would be helpful to define and set eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] ``` @@ -745,6 
+732,7 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ```python # Perform buffer and agent warmups if desired if LESSON["buffer_warm_up"]: warm_up_opponent = Opponent(env, difficulty=LESSON["warm_up_opponent"]) memory = env.fill_replay_buffer( @@ -763,6 +751,33 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ``` +The observation space of Connect Four is (6, 7, 2), where the first two dimensions represent the board and the third dimension represents the player. As PyTorch uses channels-first by default, we need to preprocess the observation. Moreover, we need to flip and swap the planes of the observation to account for the fact that the agent will play as both player 0 and player 1. We can define a function to do this as follows: + +
+ Transform and Flip + + ```python + def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. + + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + ``` +
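+As a quick sanity check (a minimal sketch, assuming the ```transform_and_flip``` helper and the imports above are in scope), we can pass a fresh observation through the function and confirm the expected shapes:
+
+```python
+from pettingzoo.classic import connect_four_v3
+
+env = connect_four_v3.env()
+env.reset()
+observation, _, _, _, _ = env.last()
+
+# The (6, 7, 2) observation becomes a batched, channels-first (1, 2, 6, 7) array,
+# alongside a copy mirrored left-to-right for data augmentation
+state, state_flipped = transform_and_flip(observation, player=0)
+print(state.shape, state_flipped.shape)  # (1, 2, 6, 7) (1, 2, 6, 7)
+```
+
+During training, both the original and mirrored states are stored in the replay buffer (with the mirrored action ```6 - action```), doubling the experience gathered from each move.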
+ + ### Self-play In this tutorial, we use self-play as the final lesson in our curriculum. By iteratively improving our agent and making it learn to win against itself, we can allow it to discover new strategies and achieve higher performance. The weights of our pretrained agent from an earlier lesson can be loaded to the population as follows: @@ -774,7 +789,7 @@ In this tutorial, we use self-play as the final lesson in our curriculum. By ite if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -824,24 +839,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents ```python if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -854,7 +868,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -883,23 +897,21 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -907,23 +919,18 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player = 0 ) - 
p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -952,7 +959,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -964,29 +971,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -994,24 +995,19 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + observation, cumulative_reward, done, truncation, _ = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player = 1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -1045,7 +1041,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -1100,7 +1096,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() player = -1 # Tracker for which player"s turn it is @@ -1120,42 +1116,42 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents if player < 0: 
if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1192,34 +1188,34 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") + + pbar.close() ``` diff --git a/docs/tutorials/agilerl/MADDPG.md b/docs/tutorials/agilerl/MADDPG.md index bc6c52e8b..7052b8b1a 100644 --- a/docs/tutorials/agilerl/MADDPG.md +++ b/docs/tutorials/agilerl/MADDPG.md @@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below. ``` ## Code -### Train multiple agents using MADDPG +### Train agents using MADDPG The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. 
If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux). ```{eval-rst} diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py index a56464c5f..6b9ec9770 100644 --- a/tutorials/AgileRL/agilerl_dqn_curriculum.py +++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py @@ -2,6 +2,7 @@ Author: Nick (https://github.com/nicku-a) """ + import copy import os import random @@ -15,7 +16,7 @@ from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent): while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player=0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 + ) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent): # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 + ) if not opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = 
np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 + ) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -323,11 +320,11 @@ def __init__(self, env, difficulty): self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False): return (True, reward, ended) + ((lengths,) if return_length else ()) +def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. + + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + + if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("===== AgileRL Curriculum Learning Demo =====") @@ -522,7 +538,6 @@ def outcome(self, action, player, return_length=False): "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -549,7 +564,7 @@ def outcome(self, action, player, return_length=False): action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -563,7 +578,6 @@ def outcome(self, action, player, return_length=False): # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -574,8 +588,8 @@ def outcome(self, action, player, return_length=False): tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -606,12 +620,7 @@ def 
outcome(self, action, player, return_length=False): # Define training loop parameters episodes_per_epoch = 10 - - # ! NOTE: Uncomment the max_episodes line below to change the number of training episodes. ! # - # It is deliberately set low to allow testing to ensure this tutorial is sound. - max_episodes = 10 - # max_episodes = LESSON["max_train_episodes"] # Total episodes - + max_episodes = LESSON["max_train_episodes"] # Total episodes max_steps = 500 # Maximum steps to take in each episode evo_epochs = 20 # Evolution frequency evo_loop = 50 # Number of evaluation episodes @@ -620,12 +629,11 @@ def outcome(self, action, player, return_length=False): eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -659,24 +667,23 @@ def outcome(self, action, player, return_length=False): print("Agent population warmed up.") if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -689,7 +696,7 @@ def outcome(self, action, player, return_length=False): for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -718,23 +725,23 @@ def outcome(self, action, player, return_length=False): for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip( + observation, player=0 + ) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -742,23 +749,18 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, 
truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 ) - p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -787,7 +789,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -799,29 +801,25 @@ def outcome(self, action, player, return_length=False): # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -829,24 +827,25 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -880,7 +879,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -935,7 +934,13 @@ def outcome(self, action, player, return_length=False): rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() 
player = -1 # Tracker for which player"s turn it is @@ -955,42 +960,52 @@ def outcome(self, action, player, return_length=False): if player < 0: if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1027,31 +1042,29 @@ def outcome(self, action, player, return_length=False): for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index 37e193f40..99d19e17c 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -2,22 +2,22 @@ Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a) """ + import os +from copy import deepcopy import numpy as np import supersuit as ss import torch from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer -from 
agilerl.hpo.mutation import Mutations -from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from pettingzoo.atari import space_invaders_v2 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MADDPG Demo =====") # Define the network configuration NET_CONFIG = { @@ -31,47 +31,59 @@ # Define the initial hyperparameters INIT_HP = { - "POPULATION_SIZE": 2, + "POPULATION_SIZE": 1, "ALGO": "MADDPG", # Algorithm # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": True, - "BATCH_SIZE": 8, # Batch size + "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor - "MEMORY_SIZE": 10000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "MEMORY_SIZE": 100000, # Max memory buffer size + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters } + num_envs = 8 # Define the space invaders environment as a parallel environment env = space_invaders_v2.parallel_env() - if INIT_HP["CHANNELS_LAST"]: - # Environment processing for image based observations - env = ss.frame_skip_v0(env, 4) - env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) - env = ss.color_reduction_v0(env, mode="B") - env = ss.resize_v1(env, x_size=84, y_size=84) - env = ss.frame_stack_v1(env, 4) + + # Environment processing for image based observations + env = ss.frame_skip_v0(env, 4) + env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) + env = ss.color_reduction_v0(env, mode="B") + env = ss.resize_v1(env, x_size=84, y_size=84) + env = ss.frame_stack_v1(env, 4) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) + env.reset() # Configure the multi-agent algo input arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = [env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Pre-process image dimensions for pytorch convolutional layers if INIT_HP["CHANNELS_LAST"]: @@ 
-84,7 +96,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + agent = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -92,8 +104,9 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, - ) + )[0] # Configure the multi-agent replay buffer field_names = ["state", "action", "reward", "next_state", "done"] @@ -104,152 +117,138 @@ device=device, ) - # Instantiate a tournament selection object (used for HPO) - tournament = TournamentSelection( - tournament_size=2, # Tournament selection size - elitism=True, # Elitism in tournament selection - population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores - - # Instantiate a mutations object (used for HPO) - mutations = Mutations( - algo=INIT_HP["ALGO"], - no_mutation=0.2, # Probability of no mutation - architecture=0.2, # Probability of architecture mutation - new_layer_prob=0.2, # Probability of new layer mutation - parameters=0.2, # Probability of parameter mutation - activation=0, # Probability of activation function mutation - rl_hp=0.2, # Probability of RL hyperparameter mutation - rl_hp_selection=[ - "lr", - "learn_step", - "batch_size", - ], # RL hyperparams selected for mutation - mutation_sd=0.1, # Mutation strength - # Define search space for each hyperparameter - min_lr=0.0001, - max_lr=0.01, - min_learn_step=1, - max_learn_step=120, - min_batch_size=8, - max_batch_size=64, - agent_ids=INIT_HP["AGENT_IDS"], # Agent IDs - arch=NET_CONFIG["arch"], # MLP or CNN - rand_seed=1, - device=device, - ) - # Define training loop parameters - max_episodes = 5 # Total episodes (default: 6000) - max_steps = 900 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes - elite = pop[0] # Assign a placeholder "elite" agent - - # Training loop - for idx_epi in trange(max_episodes): - for agent in pop: # Loop through population - state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + agent_ids = deepcopy(env.agents) + max_steps = 20000 # Max steps (default: 2000000) + learning_delay = 500 # Steps before starting learning + training_steps = 10000 # Frequency at which we evaluate training score + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes + + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less(agent.steps[-1], max_steps): + state, info = env.reset() # Reset environment at start of episode + scores = np.zeros((num_envs, len(agent_ids))) + completed_episode_scores = [] + steps = 0 + if INIT_HP["CHANNELS_LAST"]: + state = { + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() + } + + for idx_step in range(training_steps // num_envs): + # Get next action from agent + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info + ) + if agent.discrete_actions: + action = discrete_action + else: + action = cont_actions + + # Act in environment + action = {agent: env.action_space(agent).sample() for agent in env.agents} + next_state, reward, termination, truncation, info = env.step(action) + if not termination: + assert False 
+ scores += np.array(list(reward.values())).transpose() + total_steps += num_envs + steps += num_envs + + # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) - for agent_id, s in state.items() + next_state = { + agent_id: np.moveaxis(ns, [-1], [-3]) + for agent_id, ns in next_state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - - # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions - ) - if agent.discrete_actions: - action = discrete_action - else: - action = cont_actions - - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment - - # Image processing if necessary for the environment - if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} - next_state = { - agent_id: np.moveaxis(ns, [-1], [-3]) - for agent_id, ns in next_state.items() - } - - # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r - - # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + + # Save experiences to replay buffer + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) + + # Learn according to learning frequency + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm - - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } - state = next_state - - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] - - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif len(memory) >= agent.batch_size and memory.counter > learning_delay: + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + + state = next_state + + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = 
np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) + + pbar.update(training_steps) + + agent.steps[-1] += steps + + # Evaluate population + fitness = agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, + sum_scores=False, + ) + pop_episode_scores = np.array(completed_episode_scores) + mean_scores = np.mean(pop_episode_scores, axis=0) + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {agent.steps[-1]}") + print("Scores:") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} score: {mean_scores[idx]}") + print("Fitness") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} fitness: {fitness[idx]}") + print("Previous 5 fitness avgs") + for idx, sub_agent in enumerate(agent_ids): print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + f" {sub_agent} fitness average: {np.mean(agent.fitness[-5:], axis=0)[idx]}" ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Update step counter + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MADDPG" filename = "MADDPG_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + agent.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py index cc6ed9009..11335b45a 100644 --- a/tutorials/AgileRL/agilerl_matd3.py +++ b/tutorials/AgileRL/agilerl_matd3.py @@ -2,6 +2,7 @@ Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a) """ + import os import numpy as np @@ -9,14 +10,15 @@ from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from pettingzoo.mpe import simple_speaker_listener_v4 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MATD3 Demo =====") + print("===== AgileRL Online Multi-Agent Demo =====") # Define the network configuration NET_CONFIG = { @@ -31,36 +33,47 @@ # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": False, "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor "MEMORY_SIZE": 100000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters "POLICY_FREQ": 2, # Policy frequnecy } + num_envs = 8 # Define the 
simple speaker listener environment as a parallel environment env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) env.reset() # Configure the multi-agent algo input arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = [env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Not applicable to MPE environments, used when images are used for observations (Atari environments) if INIT_HP["CHANNELS_LAST"]: @@ -73,7 +86,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -81,6 +94,7 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, ) @@ -98,8 +112,8 @@ tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -123,116 +137,148 @@ ) # Define training loop parameters - max_episodes = 500 # Total episodes (default: 6000) - max_steps = 25 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes + max_steps = 13000 # Max steps (default: 2000000) + learning_delay = 0 # Steps before starting learning + evo_steps = 1000 # Evolution frequency + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes elite = pop[0] # Assign a placeholder "elite" agent - # Training loop - for idx_epi in trange(max_episodes): + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less([agent.steps[-1] for agent in pop], max_steps).all(): + pop_episode_scores = [] for agent in pop: # Loop through population state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + scores = np.zeros(num_envs) + completed_episode_scores = [] + steps = 0 if INIT_HP["CHANNELS_LAST"]: state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) + agent_id: 
np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - + for idx_step in range(evo_steps // num_envs): # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info ) if agent.discrete_actions: action = discrete_action else: action = cont_actions - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment + # Act in environment + next_state, reward, termination, truncation, info = env.step(action) + + scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1) + total_steps += num_envs + steps += num_envs # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} next_state = { agent_id: np.moveaxis(ns, [-1], [-3]) for agent_id, ns in next_state.items() } # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay + ): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif ( + len(memory) >= agent.batch_size and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } state = next_state - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + 
completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + pbar.update(evo_steps // len(pop)) + + agent.steps[-1] += steps + pop_episode_scores.append(completed_episode_scores) + + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, ) + for agent in pop + ] + mean_scores = [ + ( + np.mean(episode_scores) + if len(episode_scores) > 0 + else "0 completed episodes" + ) + for episode_scores in pop_episode_scores + ] + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {[agent.steps[-1] for agent in pop]}") + print(f"Scores: {mean_scores}") + print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}') + print( + f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}' + ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Tournament selection and population mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + # Update step counter + for agent in pop: + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MATD3" filename = "MATD3_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py index f5a2d4b38..67d3ad9cc 100644 --- a/tutorials/AgileRL/render_agilerl_dqn.py +++ b/tutorials/AgileRL/render_agilerl_dqn.py @@ -4,7 +4,7 @@ import numpy as np import torch from agilerl.algorithms.dqn import DQN -from agilerl_dqn_curriculum import Opponent +from agilerl_dqn_curriculum import Opponent, transform_and_flip from PIL import Image, ImageDraw, ImageFont from pettingzoo.classic import connect_four_v3 @@ -68,16 +68,8 @@ def resize_frames(frames, fraction): state_dim = np.zeros(state_dim[0]).flatten().shape action_dim = action_dim[0] - # Instantiate an DQN object - dqn = DQN( - state_dim, - action_dim, - one_hot, - device=device, - ) - - # Load the saved algorithm into the DQN object - dqn.loadCheckpoint(path) + # Load the saved agent + dqn = DQN.load(path, device) for opponent_difficulty in ["random", "weak", "strong", "self"]: # Create opponent @@ -120,38 +112,35 @@ def resize_frames(frames, fraction): for idx_step in range(max_steps): action_mask = observation["action_mask"] if player < 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=0) if opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 ] # Get next action from agent if player > 0: - state = 
np.moveaxis(observation["observation"], [-1], [-3]) - state[[0, 1], :, :] = state[[0, 1], :, :] - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=1) if not opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py index ca47349d5..2713b48fd 100644 --- a/tutorials/AgileRL/render_agilerl_maddpg.py +++ b/tutorials/AgileRL/render_agilerl_maddpg.py @@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - maddpg = MADDPG( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MADDPG/MADDPG_trained_agent.pt" - maddpg.loadCheckpoint(path) + maddpg = MADDPG.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -106,20 +93,9 @@ def _label_with_episode_number(frame, episode_num): agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) for agent_id, s in state.items() } - - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = maddpg.getAction( - state, - epsilon=0, - agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = maddpg.get_action( + state, training=False, infos=info ) if maddpg.discrete_actions: action = discrete_action @@ -131,7 +107,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py index efcc610cd..8bfae5673 100644 --- a/tutorials/AgileRL/render_agilerl_matd3.py +++ b/tutorials/AgileRL/render_agilerl_matd3.py @@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - matd3 = MATD3( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MATD3/MATD3_trained_agent.pt" - matd3.loadCheckpoint(path) + matd3 = MATD3.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -94,19 +81,9 @@ def _label_with_episode_number(frame, episode_num): agent_reward = {agent_id: 0 for agent_id in agent_ids} score = 0 for _ in 
range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = matd3.getAction( - state, - epsilon=0, - agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = matd3.get_action( + state, training=False, infos=info ) if matd3.discrete_actions: action = discrete_action @@ -118,7 +95,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt index 1262ee83c..e5f7d1b3d 100644 --- a/tutorials/AgileRL/requirements.txt +++ b/tutorials/AgileRL/requirements.txt @@ -1,4 +1,4 @@ -agilerl==0.1.22; python_version >= '3.9' +agilerl==1.0.20; python_version >= '3.10' pettingzoo[classic,atari,mpe]>=1.23.1 SuperSuit>=3.9.0 torch>=2.0.1
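
A minimal, self-contained sketch of the inference path used by the updated render scripts above: load a trained agent with `MATD3.load`, query actions with `get_action(state, training=False, infos=info)`, and squeeze the leading batch dimension before stepping the PettingZoo parallel env. The `agilerl.algorithms.matd3` import path, the checkpoint location, and the 25-step episode cap are assumptions for illustration, not part of this change set:

```python
import torch
from agilerl.algorithms.matd3 import MATD3  # assumed module path for the MATD3 class
from pettingzoo.mpe import simple_speaker_listener_v4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained agent directly from its checkpoint
# (path assumed to match the save path used in the training script)
matd3 = MATD3.load("./models/MATD3/MATD3_trained_agent.pt", device)

# Single (non-vectorised) environment for a quick rollout
env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
state, info = env.reset()
scores = {agent_id: 0 for agent_id in env.agents}

for _ in range(25):  # illustrative episode cap
    # Inference-mode action selection; masks and env-defined actions come in via `infos`
    cont_actions, discrete_action = matd3.get_action(state, training=False, infos=info)
    action = discrete_action if matd3.discrete_actions else cont_actions

    # Remove the leading batch dimension before stepping the PettingZoo parallel env
    state, reward, termination, truncation, info = env.step(
        {agent: a.squeeze() for agent, a in action.items()}
    )
    for agent_id, r in reward.items():
        scores[agent_id] += r
    if any(termination.values()) or any(truncation.values()):
        break

env.close()
print(scores)
```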