diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml index 88079fe4e..a4b69870a 100644 --- a/.github/workflows/build-publish.yml +++ b/.github/workflows/build-publish.yml @@ -31,6 +31,9 @@ jobs: - os: ubuntu-latest python: 311 platform: manylinux_x86_64 + - os: ubuntu-latest + python: 312 + platform: manylinux_x86_64 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-test.yml b/.github/workflows/linux-test.yml index 7a7139b6b..536cd2e72 100644 --- a/.github/workflows/linux-test.yml +++ b/.github/workflows/linux-test.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml index f74a9b3c5..b31b8cbff 100644 --- a/.github/workflows/linux-tutorials-test.yml +++ b/.github/workflows/linux-tutorials-test.yml @@ -15,9 +15,11 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false + matrix: python-version: ['3.8', '3.9', '3.10', '3.11'] tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test] # TODO: fix tutorials and add back Ray + steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml index 82e7a3d15..83d91809a 100644 --- a/.github/workflows/macos-test.yml +++ b/.github/workflows/macos-test.yml @@ -15,7 +15,7 @@ jobs: matrix: # Big Sur, Monterey os: [macos-11, macos-12] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/README.md b/README.md index 395c9cbe9..adb30215b 100644 --- a/README.md +++ b/README.md @@ -73,11 +73,6 @@ SuperSuit is a library that includes all commonly used wrappers in RL (frame sta PettingZoo keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion. -## Project Maintainers -Project Manager: [Elliot Tower](https://github.com/elliottower/) - -Maintenance for this project is also contributed by the broader Farama team: [farama.org/team](https://farama.org/team). - ## Citation To cite this project in publication, please use @@ -92,3 +87,6 @@ To cite this project in publication, please use year={2021} } ``` +## Project Maintainers +- Project Manager: [David Gerard](https://github.com/David-GERARD) - `david.gerard.23@ucl.ac.uk`. +- Maintenance for this project is also contributed by the broader Farama team: [farama.org/team](https://farama.org/team). diff --git a/docs/api/aec.md b/docs/api/aec.md index 8396c71c9..9248adccc 100644 --- a/docs/api/aec.md +++ b/docs/api/aec.md @@ -94,8 +94,8 @@ The [_Agent Environment Cycle_](https://arxiv.org/abs/2009.13051) (AEC) model wa In an AEC environment, agents act sequentially, receiving updated observations and rewards before taking an action. The environment updates after each agent's step, making it a natural way of representing sequential games such as Chess. The AEC model is flexible enough to handle any type of game that multi-agent RL can consider. 
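For readers unfamiliar with the AEC API described in the paragraph above, a minimal interaction loop looks roughly like the following (a sketch assuming the bundled rock-paper-scissors environment; any AEC environment works the same way):

```python
from pettingzoo.classic import rps_v2

env = rps_v2.env(render_mode=None)
env.reset(seed=42)

for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    if termination or truncation:
        action = None  # a finished agent must step with None
    else:
        action = env.action_space(agent).sample()  # replace with a trained policy
    env.step(action)

env.close()
```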
-with the underlying environment updating after each agent's step. Agents receive updated observations and rewards at the beginning of their . The environment is updated after every step, -This is a natural way of representing sequential games such as Chess, and +with the underlying environment updating after each agent's step. Agents receive updated observations and rewards at the beginning of their turn. The environment is updated after every step, +This is a natural way of representing sequential games such as Chess and Go. ```{figure} /_static/img/aec_cycle_figure.png :width: 480px diff --git a/docs/api/utils.md b/docs/api/utils.md index abc9d01fc..0b0e319cb 100644 --- a/docs/api/utils.md +++ b/docs/api/utils.md @@ -165,7 +165,7 @@ Base class which is used by [CaptureStdoutWrapper](https://pettingzoo.farama.org The agent selector utility allows for easy cycling of agents in an AEC environment. At any time it can be reset or reinitialized with a new order, allowing for changes in turn order or handling a dynamic number of agents (see [Knights-Archers-Zombies](https://pettingzoo.farama.org/environments/butterfly/knights_archers_zombies/) for an example of spawning/killing agents) -Note: while many PettingZoo environments use agent_selector to manage agent cycling internally, it is not intended to be used externally when interacting with an environment. Instead, use `for agent in env.agent_iter()` (see [AEC API Usage](https://pettingzoo.farama.org/api/aec/#usage)). +Note: while many PettingZoo environments use AgentSelector to manage agent cycling internally, it is not intended to be used externally when interacting with an environment. Instead, use `for agent in env.agent_iter()` (see [AEC API Usage](https://pettingzoo.farama.org/api/aec/#usage)). ```{eval-rst} .. currentmodule:: pettingzoo.utils diff --git a/docs/code_examples/aec_rps.py b/docs/code_examples/aec_rps.py index 7ae982167..7272f75bd 100644 --- a/docs/code_examples/aec_rps.py +++ b/docs/code_examples/aec_rps.py @@ -5,7 +5,7 @@ from gymnasium.spaces import Discrete from pettingzoo import AECEnv -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers ROCK = 0 PAPER = 1 @@ -156,9 +156,9 @@ def reset(self, seed=None, options=None): self.observations = {agent: NONE for agent in self.agents} self.num_moves = 0 """ - Our agent_selector utility allows easy cyclic stepping through the agents list. + Our AgentSelector utility allows easy cyclic stepping through the agents list. """ - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.next() def step(self, action): diff --git a/docs/code_examples/aec_rps_usage.py b/docs/code_examples/aec_rps_usage.py index 71edc4e73..da7d2111d 100644 --- a/docs/code_examples/aec_rps_usage.py +++ b/docs/code_examples/aec_rps_usage.py @@ -1,4 +1,4 @@ -import aec_rps +from . import aec_rps env = aec_rps.env(render_mode="human") env.reset(seed=42) diff --git a/docs/code_examples/parallel_rps_usage.py b/docs/code_examples/parallel_rps_usage.py index 38949eb78..a75aa153d 100644 --- a/docs/code_examples/parallel_rps_usage.py +++ b/docs/code_examples/parallel_rps_usage.py @@ -1,4 +1,4 @@ -import parallel_rps +from . 
import parallel_rps env = parallel_rps.parallel_env(render_mode="human") observations, infos = env.reset() diff --git a/docs/content/environment_creation.md b/docs/content/environment_creation.md index 8b4332872..4347c49c7 100644 --- a/docs/content/environment_creation.md +++ b/docs/content/environment_creation.md @@ -62,14 +62,14 @@ The utils directory also contain some classes which are only helpful for develop ### Agent selector -The `agent_selector` class steps through agents in a cycle +The `AgentSelector` class steps through agents in a cycle It can be used as follows to cycle through the list of agents: ```python -from pettingzoo.utils import agent_selector +from pettingzoo.utils import AgentSelector agents = ["agent_1", "agent_2", "agent_3"] -selector = agent_selector(agents) +selector = AgentSelector(agents) agent_selection = selector.reset() # agent_selection will be "agent_1" for i in range(100): diff --git a/docs/environments/third_party_envs.md b/docs/environments/third_party_envs.md index aeca31fb9..167f14de9 100644 --- a/docs/environments/third_party_envs.md +++ b/docs/environments/third_party_envs.md @@ -12,6 +12,18 @@ lastpage: ## Environments using the latest versions of PettingZoo *Due to a very recent major release of PettingZoo, there are currently few contributed third-party environments. If you'd like to contribute one, please reach out on [Discord](https://discord.gg/nHg2JRN489).* +### [gfootball-gymnasium-pettingzoo](https://github.com/xihuai18/gfootball-gymnasium-pettingzoo) +[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.24.3-blue)]() +[![GitHub stars](https://img.shields.io/github/stars/xihuai18/gfootball-gymnasium-pettingzoo)]() + +Google Research Football ([GRF](https://github.com/google-research/football)) with Gymnasium and PettingZoo Compatibility. + +### [SMAC and SMACv2 with latest PettingZoo APIs](https://github.com/xihuai18/SMAC-PettingZoo) +[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.24.3-blue)]() +[![GitHub stars](https://img.shields.io/github/stars/xihuai18/gfootball-gymnasium-pettingzoo)]() + +[SMAC](https://github.com/oxwhirl/smac) and [SMACv2](https://github.com/oxwhirl/smacv2) with the latest PettingZoo Parallel APIs. + ### [Sumo-RL](https://github.com/LucasAlegre/sumo-rl) [![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.22.2-blue)]() @@ -57,6 +69,12 @@ CookingZoo: a gym-cooking derivative to simulate a complex cooking environment. A library for doing reinforcement learning using [Crazyflie](https://www.bitcraze.io/products/crazyflie-2-1/) drones. +### [DSSE: Drone Swarm Search Environment](https://github.com/pfeinsper/drone-swarm-search) +[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.22.3-blue)]() +![GitHub stars](https://img.shields.io/github/stars/pfeinsper/drone-swarm-search) + +A single and multi-agent environment to train swarms of drones for maritime search. 
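The `agent_selector` to `AgentSelector` rename that recurs throughout this patch concerns the turn-cycling utility documented in the environment-creation page above. A rough sketch of how that cycling behaves, assuming the `reset`/`next`/`is_last`/`reinit` methods keep their current semantics:

```python
from pettingzoo.utils import AgentSelector

agents = ["agent_1", "agent_2", "agent_3"]
selector = AgentSelector(agents)

agent = selector.reset()                 # "agent_1"
for _ in range(5):
    last_in_cycle = selector.is_last()   # True while "agent_3" is the selected agent
    agent = selector.next()              # wraps back to "agent_1" after "agent_3"

selector.reinit(["agent_3", "agent_2", "agent_1"])  # install a new turn order
agent = selector.reset()                 # now "agent_3"
```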
+ ### [PettingZoo Dilemma Envs](https://github.com/tianyu-z/pettingzoo_dilemma_envs) diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md index 5b701f28c..9cd9cb30a 100644 --- a/docs/tutorials/agilerl/DQN.md +++ b/docs/tutorials/agilerl/DQN.md @@ -62,7 +62,7 @@ Importing the following packages, functions and classes will enable us to run th from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection - from agilerl.utils.utils import initialPopulation + from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -167,27 +167,23 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip(observation, player = 0) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -211,7 +207,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -221,31 +217,25 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 
2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip(observation, player = 1) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -269,7 +259,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -431,11 +421,11 @@ When defining the different lessons in our curriculum, we can increase the diffi self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -640,7 +630,6 @@ Before we go any further in this tutorial, it would be helpful to define and set "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -667,7 +656,7 @@ Before we go any further in this tutorial, it would be helpful to define and set action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -681,7 +670,6 @@ Before we go any further in this tutorial, it would be helpful to define and set # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -692,8 +680,8 @@ Before we go any further in this tutorial, it would be helpful to define and set tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -733,7 +721,6 @@ Before we go any further in this tutorial, it would be helpful to define and set eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] ``` @@ -745,6 +732,7 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ```python # Perform buffer and agent warmups if desired + # Perform buffer and agent warmups if desired if LESSON["buffer_warm_up"]: warm_up_opponent = Opponent(env, difficulty=LESSON["warm_up_opponent"]) memory = env.fill_replay_buffer( @@ -763,6 +751,33 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran ``` +The observation space of Connect Four is (6, 7, 2), where the first two dimensions represent the board and the third dimension represents the player. 
As PyTorch uses channels-first by default, we need to preprocess the observation. Moreover, we need to flip and swap the planes of the observation to account for the fact that the agent will play as both player 0 and player 1. We can define a function to do this as follows: + +
+ Tansform and Flip + + ```python + def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. + + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + ``` +
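As a quick sanity check of the helper above, the shapes it returns can be verified with a hypothetical all-zeros observation (an illustration only, not part of the tutorial code):

```python
import numpy as np

# Hypothetical dummy observation shaped like Connect Four's (6, 7, 2) observation
dummy_obs = {
    "observation": np.zeros((6, 7, 2), dtype=np.int8),
    "action_mask": np.ones(7, dtype=np.int8),
}

state, state_flipped = transform_and_flip(dummy_obs, player=0)
print(state.shape)          # (1, 2, 6, 7): batch, channels, rows, columns
print(state_flipped.shape)  # (1, 2, 6, 7): the same board mirrored along the columns
```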
+ + ### Self-play In this tutorial, we use self-play as the final lesson in our curriculum. By iteratively improving our agent and making it learn to win against itself, we can allow it to discover new strategies and achieve higher performance. The weights of our pretrained agent from an earlier lesson can be loaded to the population as follows: @@ -774,7 +789,7 @@ In this tutorial, we use self-play as the final lesson in our curriculum. By ite if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -824,24 +839,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents ```python if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -854,7 +868,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -883,23 +897,21 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player = 0) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -907,23 +919,18 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player = 0 ) - 
p0_next_state_flipped = np.expand_dims( - np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -952,7 +959,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -964,29 +971,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip(observation, player = 1) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -994,24 +995,19 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + observation, cumulative_reward, done, truncation, _ = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player = 1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -1045,7 +1041,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -1100,7 +1096,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() player = -1 # Tracker for which player"s turn it is @@ -1120,42 +1116,42 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents if player < 0: 
if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action(state, 0, action_mask)[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1192,34 +1188,34 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") + + pbar.close() ``` diff --git a/docs/tutorials/agilerl/MADDPG.md b/docs/tutorials/agilerl/MADDPG.md index bc6c52e8b..7052b8b1a 100644 --- a/docs/tutorials/agilerl/MADDPG.md +++ b/docs/tutorials/agilerl/MADDPG.md @@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below. ``` ## Code -### Train multiple agents using MADDPG +### Train agents using MADDPG The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. 
If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux). ```{eval-rst} diff --git a/docs/tutorials/sb3/connect_four.md b/docs/tutorials/sb3/connect_four.md index 8b85f8cca..eef34deac 100644 --- a/docs/tutorials/sb3/connect_four.md +++ b/docs/tutorials/sb3/connect_four.md @@ -4,6 +4,13 @@ title: "SB3: Action Masked PPO for Connect Four" # SB3: Action Masked PPO for Connect Four +```{eval-rst} +.. warning:: + + Currently, this tutorial doesn't work with versions of gymnasium>0.29.1. We are looking into fixing it but it might take some time. + +``` + This tutorial shows how to train a agents using Maskable [Proximal Policy Optimization](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html) (PPO) on the [Connect Four](/environments/classic/chess/) environment ([AEC](/api/aec/)). It creates a custom Wrapper to convert to a [Gymnasium](https://gymnasium.farama.org/)-like environment which is compatible with [SB3 action masking](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html). diff --git a/docs/tutorials/tianshou/index.md b/docs/tutorials/tianshou/index.md index eef3a7d0c..1a879f12c 100644 --- a/docs/tutorials/tianshou/index.md +++ b/docs/tutorials/tianshou/index.md @@ -21,7 +21,7 @@ It boasts a large number of algorithms and high quality software engineering sta ## Examples using PettingZoo -* [Multi-Agent RL](https://tianshou.readthedocs.io/en/master/tutorials/tictactoe.html) +* [Multi-Agent RL](https://tianshou.org/en/master/01_tutorials/04_tictactoe.html) ## Architecture diff --git a/pettingzoo/__init__.py b/pettingzoo/__init__.py index bef7e58da..4000bdbdc 100644 --- a/pettingzoo/__init__.py +++ b/pettingzoo/__init__.py @@ -12,7 +12,7 @@ os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "hide" -__version__ = "1.24.3" +__version__ = "1.24.4" try: import sys diff --git a/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py b/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py index 0751a12e7..4573769fc 100644 --- a/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py +++ b/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py @@ -79,7 +79,7 @@ from pettingzoo.butterfly.cooperative_pong.manual_policy import ManualPolicy from pettingzoo.butterfly.cooperative_pong.paddle import Paddle from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector from pettingzoo.utils.conversions import parallel_wrapper_fn FPS = 15 @@ -370,7 +370,7 @@ def __init__(self, **kwargs): self.agents = self.env.agents[:] self.possible_agents = self.agents[:] - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() # spaces self.action_spaces = dict(zip(self.agents, self.env.action_space)) diff --git a/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py b/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py index 0f21753e6..68a9bdfdc 100644 --- a/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py +++ b/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py @@ -194,7 +194,7 @@ from pettingzoo.butterfly.knights_archers_zombies.src.players import Archer, Knight from pettingzoo.butterfly.knights_archers_zombies.src.weapons import Arrow, Sword from pettingzoo.butterfly.knights_archers_zombies.src.zombie import Zombie -from pettingzoo.utils import 
agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn sys.dont_write_bytecode = True @@ -370,7 +370,7 @@ def __init__( self.floor_patch3 = get_image(os.path.join("img", "patch3.png")) self.floor_patch4 = get_image(os.path.join("img", "patch4.png")) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.reinit() def observation_space(self, agent): diff --git a/pettingzoo/butterfly/pistonball/pistonball.py b/pettingzoo/butterfly/pistonball/pistonball.py index b15ea2872..65415593b 100644 --- a/pettingzoo/butterfly/pistonball/pistonball.py +++ b/pettingzoo/butterfly/pistonball/pistonball.py @@ -89,7 +89,7 @@ from pettingzoo import AECEnv from pettingzoo.butterfly.pistonball.manual_policy import ManualPolicy -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn _image_library = {} @@ -180,7 +180,7 @@ def __init__( self.agents = ["piston_" + str(r) for r in range(self.n_pistons)] self.possible_agents = self.agents[:] self.agent_name_mapping = dict(zip(self.agents, list(range(self.n_pistons)))) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.observation_spaces = dict( zip( diff --git a/pettingzoo/classic/chess/chess.py b/pettingzoo/classic/chess/chess.py index 5100f8fc3..81b2ccb31 100644 --- a/pettingzoo/classic/chess/chess.py +++ b/pettingzoo/classic/chess/chess.py @@ -116,7 +116,7 @@ from pettingzoo import AECEnv from pettingzoo.classic.chess import chess_utils from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(**kwargs): @@ -144,7 +144,7 @@ def __init__(self, render_mode: str | None = None, screen_height: int | None = 8 self.agents = [f"player_{i}" for i in range(2)] self.possible_agents = self.agents[:] - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.action_spaces = {name: spaces.Discrete(8 * 8 * 73) for name in self.agents} self.observation_spaces = { @@ -238,7 +238,7 @@ def reset(self, seed=None, options=None): self.board = chess.Board() - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() self.rewards = {name: 0 for name in self.agents} diff --git a/pettingzoo/classic/connect_four/connect_four.py b/pettingzoo/classic/connect_four/connect_four.py index e2a2390e9..48ce61ce1 100644 --- a/pettingzoo/classic/connect_four/connect_four.py +++ b/pettingzoo/classic/connect_four/connect_four.py @@ -69,7 +69,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def get_image(path): @@ -220,7 +220,7 @@ def reset(self, seed=None, options=None): self.truncations = {i: False for i in self.agents} self.infos = {i: {} for i in self.agents} - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() diff --git a/pettingzoo/classic/go/go.py b/pettingzoo/classic/go/go.py index 3360f520e..d9a865c67 100644 --- a/pettingzoo/classic/go/go.py +++ b/pettingzoo/classic/go/go.py @@ -81,14 +81,14 @@ 
| Action ID | Description | | :----------------------------------------------------------: | ------------------------------------------------------------ | -| | Place a stone on the 1st row of the board.
_`0`: (0,0), `1`: (0,1), ..., `N-1`: (0,N-1)_ | -| | Place a stone on the 2nd row of the board.
_`N`: (1,0), `N+1`: (1,1), ..., `2N-1`: (1,N-1)_ | +| $0 \ldots (N-1)$ | Place a stone on the 1st row of the board.
_`0`: (0,0), `1`: (0,1), ..., `N-1`: (0,N-1)_ | +| $N \ldots (2N- 1)$ | Place a stone on the 2nd row of the board.
_`N`: (1,0), `N+1`: (1,1), ..., `2N-1`: (1,N-1)_ | | ... | ... | -| | Place a stone on the Nth row of the board.
_`N^2-N`: (N-1,0), `N^2-N+1`: (N-1,1), ..., `N^2-1`: (N-1,N-1)_ | -| | Pass | +| $(N^2-N) \ldots (N^2-1)$ | Place a stone on the Nth row of the board.
_`N^2-N`: (N-1,0), `N^2-N+1`: (N-1,1), ..., `N^2-1`: (N-1,N-1)_ | +| $N^2$ | Pass | -For example, you would use action `4` to place a stone on the board at the (0,3) location or action `N^2` to pass. You can transform a non-pass action `a` back into its 2D (x,y) coordinate by computing `(a//N, a%N)` The total action space is -. +For example, you would use action `4` to place a stone on the board at the (0,3) location or action `N^2` to pass. You can transform a non-pass action `a` back into its 2D (x,y) coordinate by computing `(a//N, a%N)`. The total action space is +$N^2+1$. ### Rewards @@ -119,7 +119,7 @@ from pettingzoo import AECEnv from pettingzoo.classic.go import coords, go_base from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def get_image(path): @@ -191,7 +191,7 @@ def __init__( [spaces.Discrete(self._N * self._N + 1) for _ in range(self.num_agents)] ) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.board_history = np.zeros((self._N, self._N, 16), dtype=bool) diff --git a/pettingzoo/classic/hanabi/hanabi.py b/pettingzoo/classic/hanabi/hanabi.py index bd2f7480f..bd4441401 100644 --- a/pettingzoo/classic/hanabi/hanabi.py +++ b/pettingzoo/classic/hanabi/hanabi.py @@ -171,7 +171,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(**kwargs): @@ -441,7 +441,7 @@ def reset(self, seed=None, options=None): self.truncations = self.hanabi_env.truncations self.infos = self.hanabi_env.infos - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() def step( diff --git a/pettingzoo/classic/rps/rps.py b/pettingzoo/classic/rps/rps.py index 1b9eb6ad6..83c5abb3f 100644 --- a/pettingzoo/classic/rps/rps.py +++ b/pettingzoo/classic/rps/rps.py @@ -121,7 +121,7 @@ from gymnasium.utils import EzPickle from pettingzoo import AECEnv -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn @@ -419,7 +419,7 @@ def close(self): def reset(self, seed=None, options=None): self.agents = self.possible_agents[:] - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.next() self.rewards = {agent: 0 for agent in self.agents} self._cumulative_rewards = {agent: 0 for agent in self.agents} diff --git a/pettingzoo/classic/tictactoe/board.py b/pettingzoo/classic/tictactoe/board.py index 35186a57a..e6fee6853 100644 --- a/pettingzoo/classic/tictactoe/board.py +++ b/pettingzoo/classic/tictactoe/board.py @@ -1,79 +1,102 @@ +TTT_PLAYER1_WIN = 0 +TTT_PLAYER2_WIN = 1 +TTT_TIE = -1 +TTT_GAME_NOT_OVER = -2 + + class Board: + """Board for a TicTacToe Game. + + This tracks the position and identity of marks on the game board + and allows checking for a winner. 
+ + Example of usage: + + import random + board = Board() + + # random legal moves - for example purposes + def choose_move(board_obj: Board) -> int: + legal_moves = [i for i, mark in enumerate(board_obj.squares) if mark == 0] + return random.choice(legal_moves) + + player = 0 + while True: + move = choose_move(board) + board.play_turn(player, move) + status = board.game_status() + if status != TTT_GAME_NOT_OVER: + if status in [TTT_PLAYER1_WIN, TTT_PLAYER2_WIN]: + print(f"player {status} won") + else: # status == TTT_TIE + print("Tie Game") + break + player = player ^ 1 # swaps between players 0 and 1 + """ + + # indices of the winning lines: vertical(x3), horizontal(x3), diagonal(x2) + winning_combinations = [ + (0, 1, 2), + (3, 4, 5), + (6, 7, 8), + (0, 3, 6), + (1, 4, 7), + (2, 5, 8), + (0, 4, 8), + (2, 4, 6), + ] + def __init__(self): - # internally self.board.squares holds a flat representation of tic tac toe board - # where an empty board is [0, 0, 0, 0, 0, 0, 0, 0, 0] - # where indexes are column wise order + # self.squares holds a flat representation of the tic tac toe board. + # an empty board is [0, 0, 0, 0, 0, 0, 0, 0, 0]. + # player 1's squares are marked 1, while player 2's are marked 2. + # mapping of the flat indices to the 3x3 grid is as follows: # 0 3 6 # 1 4 7 # 2 5 8 - - # empty -- 0 - # player 0 -- 1 - # player 1 -- 2 self.squares = [0] * 9 - # precommute possible winning combinations - self.calculate_winners() + @property + def _n_empty_squares(self): + """The current number of empty squares on the board.""" + return self.squares.count(0) - def setup(self): - self.calculate_winners() + def reset(self): + """Remove all marks from the board.""" + self.squares = [0] * 9 def play_turn(self, agent, pos): - # if spot is empty - if self.squares[pos] != 0: - return - if agent == 0: - self.squares[pos] = 1 - elif agent == 1: - self.squares[pos] = 2 - return - - def calculate_winners(self): - winning_combinations = [] - indices = [x for x in range(0, 9)] - - # Vertical combinations - winning_combinations += [ - tuple(indices[i : (i + 3)]) for i in range(0, len(indices), 3) - ] - - # Horizontal combinations - winning_combinations += [ - tuple(indices[x] for x in range(y, len(indices), 3)) for y in range(0, 3) - ] - - # Diagonal combinations - winning_combinations.append(tuple(x for x in range(0, len(indices), 4))) - winning_combinations.append(tuple(x for x in range(2, len(indices) - 1, 2))) - - self.winning_combinations = winning_combinations - - # returns: - # -1 for no winner - # 1 -- agent 0 wins - # 2 -- agent 1 wins - def check_for_winner(self): - winner = -1 - for combination in self.winning_combinations: - states = [] - for index in combination: - states.append(self.squares[index]) - if all(x == 1 for x in states): - winner = 1 - if all(x == 2 for x in states): - winner = 2 - return winner - - def check_game_over(self): - winner = self.check_for_winner() - - if winner == -1 and all(square in [1, 2] for square in self.squares): - # tie - return True - elif winner in [1, 2]: - return True - else: - return False + """Place a mark by the agent in the spot given. + + The following are required for a move to be valid: + * The agent must be a known agent ID (either 0 or 1). + * The spot must be be empty. + * The spot must be in the board (integer: 0 <= spot <= 8) + + If any of those are not true, an assertion will fail. 
+ """ + assert pos >= 0 and pos <= 8, "Invalid move location" + assert agent in [0, 1], "Invalid agent" + assert self.squares[pos] == 0, "Location is not empty" + + # agent is [0, 1]. board values are stored as [1, 2]. + self.squares[pos] = agent + 1 + + def game_status(self): + """Return status (winner, TTT_TIE if no winner, or TTT_GAME_NOT_OVER).""" + for indices in self.winning_combinations: + states = [self.squares[idx] for idx in indices] + if states == [1, 1, 1]: + return TTT_PLAYER1_WIN + if states == [2, 2, 2]: + return TTT_PLAYER2_WIN + if self._n_empty_squares == 0: + return TTT_TIE + return TTT_GAME_NOT_OVER def __str__(self): return str(self.squares) + + def legal_moves(self): + """Return list of legal moves (as flat indices for spaces on the board).""" + return [i for i, mark in enumerate(self.squares) if mark == 0] diff --git a/pettingzoo/classic/tictactoe/test_board.py b/pettingzoo/classic/tictactoe/test_board.py new file mode 100644 index 000000000..b8f7e9248 --- /dev/null +++ b/pettingzoo/classic/tictactoe/test_board.py @@ -0,0 +1,127 @@ +"""Test cases for TicTacToe board.""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from pettingzoo.classic.tictactoe.board import ( # type: ignore + TTT_GAME_NOT_OVER, + TTT_PLAYER1_WIN, + TTT_PLAYER2_WIN, + TTT_TIE, + Board, +) + +# Note: mapping of moves to board positions are: +# 0 3 6 +# 1 4 7 +# 2 5 8 + +agent2_win = { + "moves": [ + # agent_id, position, board after move + (0, 4, [0, 0, 0, 0, 1, 0, 0, 0, 0]), + (1, 0, [2, 0, 0, 0, 1, 0, 0, 0, 0]), + (0, 2, [2, 0, 1, 0, 1, 0, 0, 0, 0]), + (1, 6, [2, 0, 1, 0, 1, 0, 2, 0, 0]), + (0, 3, [2, 0, 1, 1, 1, 0, 2, 0, 0]), + (1, 7, [2, 0, 1, 1, 1, 0, 2, 2, 0]), + (0, 1, [2, 1, 1, 1, 1, 0, 2, 2, 0]), + (1, 8, [2, 1, 1, 1, 1, 0, 2, 2, 2]), # agent 2 wins here + (0, 5, [2, 1, 1, 1, 1, 1, 2, 2, 2]), + ], + "max_step": 7, # should not get past here + "winner": TTT_PLAYER2_WIN, +} + +tie = { + "moves": [ # should be tie + (0, 0, [1, 0, 0, 0, 0, 0, 0, 0, 0]), + (1, 3, [1, 0, 0, 2, 0, 0, 0, 0, 0]), + (0, 1, [1, 1, 0, 2, 0, 0, 0, 0, 0]), + (1, 4, [1, 1, 0, 2, 2, 0, 0, 0, 0]), + (0, 5, [1, 1, 0, 2, 2, 1, 0, 0, 0]), + (1, 2, [1, 1, 2, 2, 2, 1, 0, 0, 0]), + (0, 6, [1, 1, 2, 2, 2, 1, 1, 0, 0]), + (1, 7, [1, 1, 2, 2, 2, 1, 1, 2, 0]), + (0, 8, [1, 1, 2, 2, 2, 1, 1, 2, 1]), + ], + "max_step": 8, + "winner": TTT_TIE, +} + +agent1_win = { + "moves": [ + (0, 0, [1, 0, 0, 0, 0, 0, 0, 0, 0]), + (1, 3, [1, 0, 0, 2, 0, 0, 0, 0, 0]), + (0, 1, [1, 1, 0, 2, 0, 0, 0, 0, 0]), + (1, 4, [1, 1, 0, 2, 2, 0, 0, 0, 0]), + (0, 2, [1, 1, 1, 2, 2, 0, 0, 0, 0]), # agent 1 should win here + (1, 5, [1, 1, 1, 2, 2, 2, 0, 0, 0]), + (0, 6, [1, 1, 1, 2, 2, 2, 1, 0, 0]), + (1, 7, [1, 1, 1, 2, 2, 2, 1, 2, 0]), + (0, 8, [1, 1, 1, 2, 2, 2, 1, 2, 1]), + ], + "max_step": 4, + "winner": TTT_PLAYER1_WIN, +} + + +@pytest.mark.parametrize("values", [agent1_win, agent2_win, tie]) +def test_tictactoe_board_games(values: dict[str, Any]) -> None: + """Test that TicTacToe games go as expected.""" + expected_winner = values["winner"] + max_step = values["max_step"] + + board = Board() + for i, (agent, pos, board_layout) in enumerate(values["moves"]): + assert i <= max_step, "max step exceed in tictactoe game" + board.play_turn(agent, pos) + assert board_layout == board.squares, "wrong tictactoe layout after move" + status = board.game_status() + if status != TTT_GAME_NOT_OVER: + assert i == max_step, "tictactoe game ended on wrong step" + assert status == expected_winner, "wrong winner in tictactoe board test" + 
break + + +def test_tictactoe_winning_boards() -> None: + """Test that winning board configurations actually win.""" + # these are the winning lines for player 1. Note that moves + # for player 2 are included to make it a legal board. + winning_lines = [ # vertical(x3), horizontal(x3), diagonal(x2) + [1, 1, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 1, 1, 1], + [1, 0, 0, 1, 0, 0, 1, 0, 0], + [0, 1, 0, 0, 1, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 1, 0, 0, 1], + [1, 0, 0, 0, 1, 0, 0, 0, 1], + [0, 0, 1, 0, 1, 0, 1, 0, 0], + ] + for line in winning_lines: + board = Board() + board.squares = line + assert board.game_status() == TTT_PLAYER1_WIN, "Bad win check in TicTacToe" + + +def test_tictactoe_bad_move() -> None: + """Test that illegal TicTacToe moves are rejected.""" + board = Board() + # 1) move out of bounds should be rejected + for outside_space in [-1, 9]: + with pytest.raises(AssertionError, match="Invalid move location"): + board.play_turn(0, outside_space) + + # 2) move by unknown agent should be rejected + for unknown_agent in [-1, 2]: + with pytest.raises(AssertionError, match="Invalid agent"): + board.play_turn(unknown_agent, 0) + + # 3) move in occupied space by either agent should be rejected + board.play_turn(0, 4) # this is fine + for agent in [0, 1]: + with pytest.raises(AssertionError, match="Location is not empty"): + board.play_turn(agent, 4) # repeating move is not valid diff --git a/pettingzoo/classic/tictactoe/tictactoe.py b/pettingzoo/classic/tictactoe/tictactoe.py index 45d357b6f..e3c219c5a 100644 --- a/pettingzoo/classic/tictactoe/tictactoe.py +++ b/pettingzoo/classic/tictactoe/tictactoe.py @@ -79,11 +79,12 @@ from gymnasium.utils import EzPickle from pettingzoo import AECEnv -from pettingzoo.classic.tictactoe.board import Board -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.classic.tictactoe.board import TTT_GAME_NOT_OVER, TTT_TIE, Board +from pettingzoo.utils import AgentSelector, wrappers def get_image(path): + """Return a pygame image loaded from the given path.""" from os import path as os_path cwd = os_path.dirname(__file__) @@ -92,6 +93,7 @@ def get_image(path): def get_font(path, size): + """Return a pygame font loaded from the given path.""" from os import path as os_path cwd = os_path.dirname(__file__) @@ -141,9 +143,9 @@ def __init__( self.rewards = {i: 0 for i in self.agents} self.terminations = {i: False for i in self.agents} self.truncations = {i: False for i in self.agents} - self.infos = {i: {"legal_moves": list(range(0, 9))} for i in self.agents} + self.infos = {i: {} for i in self.agents} - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() self.render_mode = render_mode @@ -153,42 +155,38 @@ def __init__( if self.render_mode == "human": self.clock = pygame.time.Clock() - # Key - # ---- - # blank space = 0 - # agent 0 = 1 - # agent 1 = 2 - # An observation is list of lists, where each list represents a row - # - # [[0,0,2] - # [1,2,1] - # [2,1,0]] def observe(self, agent): board_vals = np.array(self.board.squares).reshape(3, 3) cur_player = self.possible_agents.index(agent) opp_player = (cur_player + 1) % 2 - cur_p_board = np.equal(board_vals, cur_player + 1) - opp_p_board = np.equal(board_vals, opp_player + 1) - - observation = np.stack([cur_p_board, opp_p_board], axis=2).astype(np.int8) - legal_moves = self._legal_moves() if agent == self.agent_selection else [] + observation = np.empty((3, 3, 2), 
dtype=np.int8) + # this will give a copy of the board that is 1 for player 1's + # marks and zero for every other square, whether empty or not. + observation[:, :, 0] = np.equal(board_vals, cur_player + 1) + observation[:, :, 1] = np.equal(board_vals, opp_player + 1) - action_mask = np.zeros(9, "int8") - for i in legal_moves: - action_mask[i] = 1 + action_mask = self._get_mask(agent) return {"observation": observation, "action_mask": action_mask} + def _get_mask(self, agent): + action_mask = np.zeros(9, dtype=np.int8) + + # Per the documentation, the mask of any agent other than the + # currently selected one is all zeros. + if agent == self.agent_selection: + for i in self.board.legal_moves(): + action_mask[i] = 1 + + return action_mask + def observation_space(self, agent): return self.observation_spaces[agent] def action_space(self, agent): return self.action_spaces[agent] - def _legal_moves(self): - return [i for i in range(len(self.board.squares)) if self.board.squares[i] == 0] - # action in this case is a value from 0 to 8 indicating position to move on tictactoe board def step(self, action): if ( @@ -196,45 +194,30 @@ def step(self, action): or self.truncations[self.agent_selection] ): return self._was_dead_step(action) - # check if input action is a valid move (0 == empty spot) - assert self.board.squares[action] == 0, "played illegal move" - # play turn - self.board.play_turn(self.agents.index(self.agent_selection), action) - - # update infos - # list of valid actions (indexes in board) - # next_agent = self.agents[(self.agents.index(self.agent_selection) + 1) % len(self.agents)] - next_agent = self._agent_selector.next() - if self.board.check_game_over(): - winner = self.board.check_for_winner() + self.board.play_turn(self.agents.index(self.agent_selection), action) - if winner == -1: - # tie + status = self.board.game_status() + if status != TTT_GAME_NOT_OVER: + if status == TTT_TIE: pass - elif winner == 1: - # agent 0 won - self.rewards[self.agents[0]] += 1 - self.rewards[self.agents[1]] -= 1 else: - # agent 1 won - self.rewards[self.agents[1]] += 1 - self.rewards[self.agents[0]] -= 1 + winner = status # either TTT_PLAYER1_WIN or TTT_PLAYER2_WIN + loser = winner ^ 1 # 0 -> 1; 1 -> 0 + self.rewards[self.agents[winner]] += 1 + self.rewards[self.agents[loser]] -= 1 # once either play wins or there is a draw, game over, both players are done self.terminations = {i: True for i in self.agents} + self._accumulate_rewards() - # Switch selection to next agents - self._cumulative_rewards[self.agent_selection] = 0 - self.agent_selection = next_agent + self.agent_selection = self._agent_selector.next() - self._accumulate_rewards() if self.render_mode == "human": self.render() def reset(self, seed=None, options=None): - # reset environment - self.board = Board() + self.board.reset() self.agents = self.possible_agents[:] self.rewards = {i: 0 for i in self.agents} @@ -244,10 +227,9 @@ def reset(self, seed=None, options=None): self.infos = {i: {} for i in self.agents} # selects the first agent self._agent_selector.reinit(self.agents) - self._agent_selector.reset() self.agent_selection = self._agent_selector.reset() - if self.screen is None: + if self.render_mode is not None and self.screen is None: pygame.init() if self.render_mode == "human": @@ -255,7 +237,7 @@ def reset(self, seed=None, options=None): (self.screen_height, self.screen_height) ) pygame.display.set_caption("Tic-Tac-Toe") - else: + elif self.render_mode == "rgb_array": self.screen = pygame.Surface((self.screen_height, 
self.screen_height)) def close(self): diff --git a/pettingzoo/mpe/_mpe_utils/simple_env.py b/pettingzoo/mpe/_mpe_utils/simple_env.py index 6d420fe76..6cc9bb3d2 100644 --- a/pettingzoo/mpe/_mpe_utils/simple_env.py +++ b/pettingzoo/mpe/_mpe_utils/simple_env.py @@ -9,7 +9,7 @@ from pettingzoo import AECEnv from pettingzoo.mpe._mpe_utils.core import Agent from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -42,6 +42,7 @@ def __init__( render_mode=None, continuous_actions=False, local_ratio=None, + dynamic_rescaling=False, ): super().__init__() @@ -66,6 +67,7 @@ def __init__( self.world = world self.continuous_actions = continuous_actions self.local_ratio = local_ratio + self.dynamic_rescaling = dynamic_rescaling self.scenario.reset_world(self.world, self.np_random) @@ -75,7 +77,7 @@ def __init__( agent.name: idx for idx, agent in enumerate(self.world.agents) } - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) # set spaces self.action_spaces = dict() @@ -116,6 +118,11 @@ def __init__( dtype=np.float32, ) + # Get the original cam_range + # This will be used to scale the rendering + all_poses = [entity.state.p_pos for entity in self.world.entities] + self.original_cam_range = np.max(np.abs(np.array(all_poses))) + self.steps = 0 self.current_actions = [None] * self.num_agents @@ -295,6 +302,10 @@ def draw(self): all_poses = [entity.state.p_pos for entity in self.world.entities] cam_range = np.max(np.abs(np.array(all_poses))) + # The scaling factor is used for dynamic rescaling of the rendering - a.k.a Zoom In/Zoom Out effect + # The 0.9 is a factor to keep the entities from appearing "too" out-of-bounds + scaling_factor = 0.9 * self.original_cam_range / cam_range + # update geometry and text positions text_line = 0 for e, entity in enumerate(self.world.entities): @@ -309,12 +320,15 @@ def draw(self): y = (y / cam_range) * self.height // 2 * 0.9 x += self.width // 2 y += self.height // 2 - pygame.draw.circle( - self.screen, entity.color * 200, (x, y), entity.size * 350 - ) # 350 is an arbitrary scale factor to get pygame to render similar sizes as pyglet - pygame.draw.circle( - self.screen, (0, 0, 0), (x, y), entity.size * 350, 1 - ) # borders + + # 350 is an arbitrary scale factor to get pygame to render similar sizes as pyglet + if self.dynamic_rescaling: + radius = entity.size * 350 * scaling_factor + else: + radius = entity.size * 350 + + pygame.draw.circle(self.screen, entity.color * 200, (x, y), radius) + pygame.draw.circle(self.screen, (0, 0, 0), (x, y), radius, 1) # borders assert ( 0 < x < self.width and 0 < y < self.height ), f"Coordinates {(x, y)} are out of bounds." 
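To make the effect of the new `dynamic_rescaling` flag concrete, the radius computation above reduces to the following arithmetic (a sketch with made-up numbers, not code from this patch):

```python
entity_size = 0.05        # hypothetical entity size
original_cam_range = 1.0  # camera range computed from the initial entity positions
cam_range = 2.0           # entities have since spread twice as far from the origin

scaling_factor = 0.9 * original_cam_range / cam_range  # 0.45

static_radius = entity_size * 350                    # 17.5 px, previous behaviour
dynamic_radius = entity_size * 350 * scaling_factor  # 7.875 px, shrinks as the view zooms out
```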
diff --git a/pettingzoo/mpe/simple/simple.py b/pettingzoo/mpe/simple/simple.py index b9d6f255a..7431c4fb1 100644 --- a/pettingzoo/mpe/simple/simple.py +++ b/pettingzoo/mpe/simple/simple.py @@ -31,7 +31,7 @@ ### Arguments ``` python -simple_v3.env(max_cycles=25, continuous_actions=False) +simple_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -40,6 +40,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -52,7 +54,13 @@ class raw_env(SimpleEnv, EzPickle): - def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): + def __init__( + self, + max_cycles=25, + continuous_actions=False, + render_mode=None, + dynamic_rescaling=False, + ): EzPickle.__init__( self, max_cycles=max_cycles, @@ -68,6 +76,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_v3" diff --git a/pettingzoo/mpe/simple_adversary/simple_adversary.py b/pettingzoo/mpe/simple_adversary/simple_adversary.py index 674790c38..cf7a38499 100644 --- a/pettingzoo/mpe/simple_adversary/simple_adversary.py +++ b/pettingzoo/mpe/simple_adversary/simple_adversary.py @@ -39,7 +39,7 @@ ### Arguments ``` python -simple_adversary_v3.env(N=2, max_cycles=25, continuous_actions=False) +simple_adversary_v3.env(N=2, max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -50,6 +50,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -62,7 +64,14 @@ class raw_env(SimpleEnv, EzPickle): - def __init__(self, N=2, max_cycles=25, continuous_actions=False, render_mode=None): + def __init__( + self, + N=2, + max_cycles=25, + continuous_actions=False, + render_mode=None, + dynamic_rescaling=False, + ): EzPickle.__init__( self, N=N, @@ -79,6 +88,7 @@ def __init__(self, N=2, max_cycles=25, continuous_actions=False, render_mode=Non render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_adversary_v3" diff --git a/pettingzoo/mpe/simple_crypto/simple_crypto.py b/pettingzoo/mpe/simple_crypto/simple_crypto.py index 66a8d2ad1..f74b5f0d1 100644 --- a/pettingzoo/mpe/simple_crypto/simple_crypto.py +++ b/pettingzoo/mpe/simple_crypto/simple_crypto.py @@ -45,7 +45,7 @@ ### Arguments ``` python -simple_crypto_v3.env(max_cycles=25, continuous_actions=False) +simple_crypto_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -54,6 +54,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -73,7 +75,13 @@ class raw_env(SimpleEnv, EzPickle): - def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): + def __init__( + self, + max_cycles=25, + continuous_actions=False, + render_mode=None, + dynamic_rescaling=False, + ): EzPickle.__init__( self, max_cycles=max_cycles, @@ -89,6 +97,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): render_mode=render_mode, 
max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_crypto_v3" diff --git a/pettingzoo/mpe/simple_push/simple_push.py b/pettingzoo/mpe/simple_push/simple_push.py index 1a11a98d8..46b352803 100644 --- a/pettingzoo/mpe/simple_push/simple_push.py +++ b/pettingzoo/mpe/simple_push/simple_push.py @@ -38,13 +38,16 @@ ### Arguments ``` python -simple_push_v3.env(max_cycles=25, continuous_actions=False) +simple_push_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` `max_cycles`: number of frames (a step for each agent) until game terminates +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + + """ import numpy as np @@ -57,7 +60,13 @@ class raw_env(SimpleEnv, EzPickle): - def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): + def __init__( + self, + max_cycles=25, + continuous_actions=False, + render_mode=None, + dynamic_rescaling=False, + ): EzPickle.__init__( self, max_cycles=max_cycles, @@ -73,6 +82,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_push_v3" diff --git a/pettingzoo/mpe/simple_reference/simple_reference.py b/pettingzoo/mpe/simple_reference/simple_reference.py index a934b9014..d058e7d21 100644 --- a/pettingzoo/mpe/simple_reference/simple_reference.py +++ b/pettingzoo/mpe/simple_reference/simple_reference.py @@ -40,7 +40,7 @@ ``` python -simple_reference_v3.env(local_ratio=0.5, max_cycles=25, continuous_actions=False) +simple_reference_v3.env(local_ratio=0.5, max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -51,6 +51,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -64,7 +66,12 @@ class raw_env(SimpleEnv, EzPickle): def __init__( - self, local_ratio=0.5, max_cycles=25, continuous_actions=False, render_mode=None + self, + local_ratio=0.5, + max_cycles=25, + continuous_actions=False, + render_mode=None, + dynamic_rescaling=False, ): EzPickle.__init__( self, @@ -86,6 +93,7 @@ def __init__( max_cycles=max_cycles, continuous_actions=continuous_actions, local_ratio=local_ratio, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_reference_v3" diff --git a/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py b/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py index fbfbe9c85..4fc09e6a3 100644 --- a/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py +++ b/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py @@ -37,7 +37,7 @@ ### Arguments ``` python -simple_speaker_listener_v4.env(max_cycles=25, continuous_actions=False) +simple_speaker_listener_v4.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -46,6 +46,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -58,7 +60,13 @@ class raw_env(SimpleEnv, EzPickle): - def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): + def __init__( + self, + max_cycles=25, + continuous_actions=False, + 
render_mode=None, + dynamic_rescaling=False, + ): EzPickle.__init__( self, max_cycles=max_cycles, @@ -74,6 +82,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None): render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_speaker_listener_v4" diff --git a/pettingzoo/mpe/simple_spread/simple_spread.py b/pettingzoo/mpe/simple_spread/simple_spread.py index 83e79e53e..4313780ae 100644 --- a/pettingzoo/mpe/simple_spread/simple_spread.py +++ b/pettingzoo/mpe/simple_spread/simple_spread.py @@ -36,7 +36,7 @@ ### Arguments ``` python -simple_spread_v3.env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False) +simple_spread_v3.env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -49,6 +49,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -68,6 +70,7 @@ def __init__( max_cycles=25, continuous_actions=False, render_mode=None, + dynamic_rescaling=False, ): EzPickle.__init__( self, @@ -90,6 +93,7 @@ def __init__( max_cycles=max_cycles, continuous_actions=continuous_actions, local_ratio=local_ratio, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_spread_v3" diff --git a/pettingzoo/mpe/simple_tag/simple_tag.py b/pettingzoo/mpe/simple_tag/simple_tag.py index 7727eb425..1f6c3b48f 100644 --- a/pettingzoo/mpe/simple_tag/simple_tag.py +++ b/pettingzoo/mpe/simple_tag/simple_tag.py @@ -45,7 +45,7 @@ def bound(x): ### Arguments ``` python -simple_tag_v3.env(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=25, continuous_actions=False) +simple_tag_v3.env(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=25, continuous_actions=False, dynamic_rescaling=False) ``` @@ -60,6 +60,8 @@ def bound(x): `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -80,6 +82,7 @@ def __init__( max_cycles=25, continuous_actions=False, render_mode=None, + dynamic_rescaling=False, ): EzPickle.__init__( self, @@ -99,6 +102,7 @@ def __init__( render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_tag_v3" diff --git a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py index 598c0d23e..0f2932743 100644 --- a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py +++ b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py @@ -30,11 +30,11 @@ In particular, the good agents reward, is -5 for every collision with an adversary, -2 x bound by the `bound` function described in simple_tag, +2 for every collision with a food, and -0.05 x minimum distance to any food. The adversarial agents are rewarded +5 for collisions and -0.1 x minimum distance to a good agent. 
s -Good agent observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest]` +Good agent observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, self_in_forest, other_agent_velocities]` Normal adversary observations:`[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest, leader_comm]` -Adversary leader observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, leader_comm]` +Adversary leader observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest, leader_comm]` *Note that when the forests prevent an agent from being seen, the observation of that agents relative position is set to (0,0).* @@ -52,7 +52,7 @@ ``` python simple_world_comm_v3.env(num_good=2, num_adversaries=4, num_obstacles=1, - num_food=2, max_cycles=25, num_forests=2, continuous_actions=False) + num_food=2, max_cycles=25, num_forests=2, continuous_actions=False, dynamic_rescaling=False) ``` @@ -71,6 +71,8 @@ `continuous_actions`: Whether agent action spaces are discrete(default) or continuous +`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size + """ import numpy as np @@ -93,6 +95,7 @@ def __init__( num_forests=2, continuous_actions=False, render_mode=None, + dynamic_rescaling=False, ): EzPickle.__init__( self, @@ -116,6 +119,7 @@ def __init__( render_mode=render_mode, max_cycles=max_cycles, continuous_actions=continuous_actions, + dynamic_rescaling=dynamic_rescaling, ) self.metadata["name"] = "simple_world_comm_v3" diff --git a/pettingzoo/sisl/multiwalker/multiwalker.py b/pettingzoo/sisl/multiwalker/multiwalker.py index 8edf250d1..30adb9fe0 100644 --- a/pettingzoo/sisl/multiwalker/multiwalker.py +++ b/pettingzoo/sisl/multiwalker/multiwalker.py @@ -125,7 +125,7 @@ from pettingzoo import AECEnv from pettingzoo.sisl.multiwalker.multiwalker_base import FPS from pettingzoo.sisl.multiwalker.multiwalker_base import MultiWalkerEnv as _env -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn @@ -156,7 +156,7 @@ def __init__(self, *args, **kwargs): self.agent_name_mapping = dict( zip(self.agents, list(range(self.env.n_walkers))) ) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) # spaces self.action_spaces = dict(zip(self.agents, self.env.action_space)) self.observation_spaces = dict(zip(self.agents, self.env.observation_space)) diff --git a/pettingzoo/sisl/pursuit/pursuit.py b/pettingzoo/sisl/pursuit/pursuit.py index c75728d31..c68f189bb 100644 --- a/pettingzoo/sisl/pursuit/pursuit.py +++ b/pettingzoo/sisl/pursuit/pursuit.py @@ -85,7 +85,7 @@ from pettingzoo import AECEnv from pettingzoo.sisl.pursuit.manual_policy import ManualPolicy from pettingzoo.sisl.pursuit.pursuit_base import Pursuit as _env -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn __all__ = ["ManualPolicy", "env", "parallel_env", "raw_env"] @@ -118,7 +118,7 @@ def __init__(self, *args, **kwargs): self.agents = ["pursuer_" + str(a) for a in range(self.env.num_agents)] self.possible_agents = self.agents[:] self.agent_name_mapping = dict(zip(self.agents, 
list(range(self.num_agents)))) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) # spaces self.n_act_agents = self.env.act_dims[0] self.action_spaces = dict(zip(self.agents, self.env.action_space)) diff --git a/pettingzoo/sisl/waterworld/waterworld.py b/pettingzoo/sisl/waterworld/waterworld.py index d2de2eb21..df1e31549 100644 --- a/pettingzoo/sisl/waterworld/waterworld.py +++ b/pettingzoo/sisl/waterworld/waterworld.py @@ -33,7 +33,7 @@ poison respectively. The number of features per sensor is 8 by default with `speed_features` enabled, or 5 if `speed_features` is turned off. Therefore with `speed_features` enabled, the observation shape takes the full form of `(8 × n_sensors) + 2`. Elements of the observation vector take on values in the range [-1, 1]. -For example, by default there are 5 agents (purple), 5 food targets (red) and 10 poison targets (green). Each agent has 30 range-limited sensors, depicted by the black lines, to detect neighboring entities (food and poison targets) resulting in 242 element vector of computed values about the +For example, by default there are 5 agents (purple), 5 food targets (green) and 10 poison targets (red). Each agent has 30 range-limited sensors, depicted by the black lines, to detect neighboring entities (food and poison targets) resulting in 242 element vector of computed values about the environment for the observation space. These values represent the distances and speeds sensed by each sensor on the archea. Sensors that do not sense any objects within their range report 0 for speed and 1 for distance. This has been fixed from the reference environments to keep items floating off screen and being lost forever. @@ -141,7 +141,7 @@ from pettingzoo import AECEnv from pettingzoo.sisl.waterworld.waterworld_base import FPS from pettingzoo.sisl.waterworld.waterworld_base import WaterworldBase as _env -from pettingzoo.utils import agent_selector, wrappers +from pettingzoo.utils import AgentSelector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn @@ -171,7 +171,7 @@ def __init__(self, *args, **kwargs): self.agents = ["pursuer_" + str(r) for r in range(self.env.num_agents)] self.possible_agents = self.agents[:] self.agent_name_mapping = dict(zip(self.agents, list(range(self.num_agents)))) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) # spaces self.action_spaces = dict(zip(self.agents, self.env.action_space)) diff --git a/pettingzoo/sisl/waterworld/waterworld_base.py b/pettingzoo/sisl/waterworld/waterworld_base.py index b6705b76b..7c82da4e4 100644 --- a/pettingzoo/sisl/waterworld/waterworld_base.py +++ b/pettingzoo/sisl/waterworld/waterworld_base.py @@ -313,6 +313,8 @@ def draw(self): def add_handlers(self): # Collision handlers for pursuers v.s. 
evaders & poisons + self.handlers = [] + for pursuer in self.pursuers: for obj in self.evaders: self.handlers.append( diff --git a/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py b/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py index 2985a07c6..1c48d6083 100644 --- a/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py +++ b/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py @@ -5,7 +5,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(): @@ -105,7 +105,7 @@ def reset(self, seed=None, options=None): for i in range(5): self.add_agent(self.np_random.choice(self.types)) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() # seed observation and action spaces diff --git a/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py b/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py index b7cbf2b30..726afa6a9 100644 --- a/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py +++ b/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py @@ -5,7 +5,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(): @@ -107,7 +107,7 @@ def reset(self, seed=None, options=None): for i in range(5): self.add_agent(self.np_random.choice(self.types)) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() # seed observation and action spaces diff --git a/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py b/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py index 7f307d5e8..5b966b174 100644 --- a/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py +++ b/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py @@ -5,7 +5,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(): @@ -99,7 +99,7 @@ def reset(self, seed=None, options=None): for i in range(5): self.add_agent(self.np_random.choice(self.types)) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = self._agent_selector.reset() # seed observation and action spaces diff --git a/pettingzoo/test/example_envs/generated_agents_env_v0.py b/pettingzoo/test/example_envs/generated_agents_env_v0.py index 28f11469b..827465382 100644 --- a/pettingzoo/test/example_envs/generated_agents_env_v0.py +++ b/pettingzoo/test/example_envs/generated_agents_env_v0.py @@ -5,7 +5,7 @@ from pettingzoo import AECEnv from pettingzoo.utils import wrappers -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector def env(): @@ -99,7 +99,7 @@ def reset(self, seed=None, options=None): for i in range(5): self.add_agent(self.np_random.choice(self.types)) - self._agent_selector = agent_selector(self.agents) + self._agent_selector = AgentSelector(self.agents) self.agent_selection = 
self._agent_selector.reset() # seed observation and action spaces diff --git a/pettingzoo/utils/__init__.py b/pettingzoo/utils/__init__.py index af9445539..1d16fe76b 100644 --- a/pettingzoo/utils/__init__.py +++ b/pettingzoo/utils/__init__.py @@ -1,4 +1,4 @@ -from pettingzoo.utils.agent_selector import agent_selector +from pettingzoo.utils.agent_selector import AgentSelector from pettingzoo.utils.average_total_reward import average_total_reward from pettingzoo.utils.conversions import ( aec_to_parallel, diff --git a/pettingzoo/utils/agent_selector.py b/pettingzoo/utils/agent_selector.py index 0b6222990..2643b1c9a 100644 --- a/pettingzoo/utils/agent_selector.py +++ b/pettingzoo/utils/agent_selector.py @@ -1,16 +1,17 @@ from __future__ import annotations from typing import Any +from warnings import warn -class agent_selector: +class AgentSelector: """Outputs an agent in the given order whenever agent_select is called. Can reinitialize to a new order. Example: - >>> from pettingzoo.utils import agent_selector - >>> agent_selector = agent_selector(agent_order=["player1", "player2"]) + >>> from pettingzoo.utils import AgentSelector + >>> agent_selector = AgentSelector(agent_order=["player1", "player2"]) >>> agent_selector.reset() 'player1' >>> agent_selector.next() @@ -52,8 +53,8 @@ def is_first(self) -> bool: """Check if the current agent is the first agent in the cycle.""" return self.selected_agent == self.agent_order[0] - def __eq__(self, other: agent_selector) -> bool: - if not isinstance(other, agent_selector): + def __eq__(self, other: AgentSelector) -> bool: + if not isinstance(other, AgentSelector): return NotImplemented return ( @@ -61,3 +62,14 @@ def __eq__(self, other: agent_selector) -> bool: and self._current_agent == other._current_agent and self.selected_agent == other.selected_agent ) + + +class agent_selector(AgentSelector): + """Deprecated version of AgentSelector. 
Use that instead.""" + + def __init__(self, *args, **kwargs): + warn( + "agent_selector is deprecated, please use AgentSelector", + DeprecationWarning, + ) + super().__init__(*args, **kwargs) diff --git a/pettingzoo/utils/conversions.py b/pettingzoo/utils/conversions.py index 601a1fb06..7cf99f6d9 100644 --- a/pettingzoo/utils/conversions.py +++ b/pettingzoo/utils/conversions.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import Callable, Dict, Optional -from pettingzoo.utils import agent_selector +from pettingzoo.utils import AgentSelector from pettingzoo.utils.env import ActionType, AECEnv, AgentID, ObsType, ParallelEnv from pettingzoo.utils.wrappers import OrderEnforcingWrapper @@ -309,7 +309,7 @@ def reset(self, seed=None, options=None): self._actions: Dict[AgentID, Optional[ActionType]] = { agent: None for agent in self.agents } - self._agent_selector = agent_selector(self._live_agents) + self._agent_selector = AgentSelector(self._live_agents) self.agent_selection = self._agent_selector.reset() self.terminations = {agent: False for agent in self.agents} self.truncations = {agent: False for agent in self.agents} @@ -377,7 +377,7 @@ def step(self, action: Optional[ActionType]): ] if len(self.env.agents): - self._agent_selector = agent_selector(self.env.agents) + self._agent_selector = AgentSelector(self.env.agents) self.agent_selection = self._agent_selector.reset() self._deads_step_first() diff --git a/pettingzoo/utils/env_logger.py b/pettingzoo/utils/env_logger.py index c5e640e47..bd505e2e3 100644 --- a/pettingzoo/utils/env_logger.py +++ b/pettingzoo/utils/env_logger.py @@ -61,20 +61,6 @@ def warn_action_out_of_bound( f"[WARNING]: Received an action {action} that was outside action space {action_space}. Environment is {backup_policy}" ) - @staticmethod - def warn_close_unrendered_env() -> None: - """Warns: ``[WARNING]: Called close on an unrendered environment.``.""" - EnvLogger._generic_warning( - "[WARNING]: Called close on an unrendered environment." - ) - - @staticmethod - def warn_close_before_reset() -> None: - """Warns: ``[WARNING]: reset() needs to be called before close.``.""" - EnvLogger._generic_warning( - "[WARNING]: reset() needs to be called before close." - ) - @staticmethod def warn_on_illegal_move() -> None: """Warns: ``[WARNING]: Illegal move made, game terminating with current player losing.``.""" diff --git a/pettingzoo/utils/wrappers/order_enforcing.py b/pettingzoo/utils/wrappers/order_enforcing.py index 649c23caa..4a1255682 100644 --- a/pettingzoo/utils/wrappers/order_enforcing.py +++ b/pettingzoo/utils/wrappers/order_enforcing.py @@ -19,11 +19,13 @@ class OrderEnforcingWrapper(BaseWrapper[AgentID, ObsType, ActionType]): """Checks if function calls or attribute access are in a disallowed order. - * error on getting rewards, terminations, truncations, infos, agent_selection before reset - * error on calling step, observe before reset - * error on iterating without stepping or resetting environment. - * warn on calling close before render or reset - * warn on calling step after environment is terminated or truncated + The following are raised: + * AttributeError if any of the following are accessed before reset(): + rewards, terminations, truncations, infos, agent_selection, + num_agents, agents. + * An error if any of the following are called before reset: + render(), step(), observe(), state(), agent_iter() + * A warning if step() is called when there are no agents remaining. 
""" def __init__(self, env: AECEnv[AgentID, ObsType, ActionType]): @@ -31,37 +33,12 @@ def __init__(self, env: AECEnv[AgentID, ObsType, ActionType]): env, AECEnv ), "OrderEnforcingWrapper is only compatible with AEC environments" self._has_reset = False - self._has_rendered = False self._has_updated = False super().__init__(env) def __getattr__(self, value: str) -> Any: - """Raises an error message when data is gotten from the env. - - Should only be gotten after reset - """ - if value == "unwrapped": - return self.env.unwrapped - elif value == "render_mode" and hasattr(self.env, "render_mode"): - return self.env.render_mode # pyright: ignore[reportGeneralTypeIssues] - elif value == "possible_agents": - try: - return self.env.possible_agents - except AttributeError: - EnvLogger.error_possible_agents_attribute_missing("possible_agents") - elif value == "observation_spaces": - raise AttributeError( - "The base environment does not have an possible_agents attribute. Use the environments `observation_space` method instead" - ) - elif value == "action_spaces": - raise AttributeError( - "The base environment does not have an possible_agents attribute. Use the environments `action_space` method instead" - ) - elif value == "agent_order": - raise AttributeError( - "agent_order has been removed from the API. Please consider using agent_iter instead." - ) - elif ( + """Raises an error if certain data is accessed before reset.""" + if ( value in { "rewards", @@ -75,13 +52,11 @@ def __getattr__(self, value: str) -> Any: and not self._has_reset ): raise AttributeError(f"{value} cannot be accessed before reset") - else: - return super().__getattr__(value) + return super().__getattr__(value) def render(self) -> None | np.ndarray | str | list: if not self._has_reset: EnvLogger.error_render_before_reset() - self._has_rendered = True return super().render() def step(self, action: ActionType) -> None: @@ -90,7 +65,6 @@ def step(self, action: ActionType) -> None: elif not self.agents: self._has_updated = True EnvLogger.warn_step_after_terminated_truncated() - return None else: self._has_updated = True super().step(action) @@ -124,8 +98,7 @@ def __str__(self) -> str: if self.__class__ is OrderEnforcingWrapper else f"{type(self).__name__}<{str(self.env)}>" ) - else: - return repr(self) + return repr(self) class AECOrderEnforcingIterable(AECIterable[AgentID, ObsType, ActionType]): @@ -134,11 +107,16 @@ def __iter__(self) -> AECOrderEnforcingIterator[AgentID, ObsType, ActionType]: class AECOrderEnforcingIterator(AECIterator[AgentID, ObsType, ActionType]): + def __init__( + self, env: OrderEnforcingWrapper[AgentID, ObsType, ActionType], max_iter: int + ): + assert isinstance( + env, OrderEnforcingWrapper + ), "env must be wrapped by OrderEnforcingWrapper" + super().__init__(env, max_iter) + def __next__(self) -> AgentID: agent = super().__next__() - assert hasattr( - self.env, "_has_updated" - ), "env must be wrapped by OrderEnforcingWrapper" assert ( self.env._has_updated # pyright: ignore[reportGeneralTypeIssues] ), "need to call step() or reset() in a loop over `agent_iter`" diff --git a/pettingzoo/utils/wrappers/terminate_illegal.py b/pettingzoo/utils/wrappers/terminate_illegal.py index a49d9a0be..79f95504a 100644 --- a/pettingzoo/utils/wrappers/terminate_illegal.py +++ b/pettingzoo/utils/wrappers/terminate_illegal.py @@ -1,4 +1,3 @@ -# pyright reportGeneralTypeIssues=false from __future__ import annotations from pettingzoo.utils.env import ActionType, AECEnv, AgentID, ObsType @@ -20,6 +19,7 @@ def __init__( 
self._illegal_value = illegal_reward self._prev_obs = None self._prev_info = None + self._terminated = False # terminated by an illegal move def reset(self, seed: int | None = None, options: dict | None = None) -> None: self._terminated = False @@ -42,7 +42,6 @@ def step(self, action: ActionType) -> None: if self._prev_obs is None: self.observe(self.agent_selection) if isinstance(self._prev_obs, dict): - assert self._prev_obs is not None assert ( "action_mask" in self._prev_obs ), f"`action_mask` not found in dictionary observation: {self._prev_obs}. Action mask must either be in `observation['action_mask']` or `info['action_mask']` to use TerminateIllegalWrapper." @@ -60,7 +59,7 @@ def step(self, action: ActionType) -> None: self.terminations[self.agent_selection] or self.truncations[self.agent_selection] ): - self._was_dead_step(action) # pyright: ignore[reportGeneralTypeIssues] + self.env.unwrapped._was_dead_step(action) elif ( not self.terminations[self.agent_selection] and not self.truncations[self.agent_selection] @@ -70,12 +69,10 @@ def step(self, action: ActionType) -> None: self.env.unwrapped._cumulative_rewards[self.agent_selection] = 0 self.env.unwrapped.terminations = {d: True for d in self.agents} self.env.unwrapped.truncations = {d: True for d in self.agents} - self._prev_obs = None - self._prev_info = None self.env.unwrapped.rewards = {d: 0 for d in self.truncations} self.env.unwrapped.rewards[current_agent] = float(self._illegal_value) - self._accumulate_rewards() - self._deads_step_first() + self.env.unwrapped._accumulate_rewards() + self.env.unwrapped._deads_step_first() self._terminated = True else: super().step(action) diff --git a/pyproject.toml b/pyproject.toml index c0160ab17..73c99ea4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", 'Intended Audience :: Science/Research', 'Topic :: Scientific/Engineering :: Artificial Intelligence', ] @@ -31,34 +32,34 @@ dynamic = ["version"] [project.optional-dependencies] # Update dependencies in `all` if any are added or removed -atari = ["multi_agent_ale_py==0.1.11", "pygame==2.3.0"] +atari = ["multi_agent_ale_py>=0.1.11", "pygame>=2.3.0"] classic = [ - "chess==1.9.4", - "rlcard==1.0.5", - "pygame==2.3.0", + "chess>=1.9.4", + "rlcard>=1.0.5", + "pygame>=2.3.0", "shimmy[openspiel]>=1.2.0" ] -butterfly = ["pygame==2.3.0", "pymunk==6.2.0"] -mpe = ["pygame==2.3.0"] -sisl = ["pygame==2.3.0", "pymunk==6.2.0", "box2d-py==2.3.5", "scipy>=1.4.1"] +butterfly = ["pygame>=2.3.0", "pymunk>=6.2.0"] +mpe = ["pygame>=2.3.0"] +sisl = ["pygame>=2.3.0", "pymunk>=6.2.0", "box2d-py>=2.3.5", "scipy>=1.4.1"] other = ["pillow>=8.0.1"] testing = [ - "pynput==1.7.6", - "pytest==8.0.0", - "AutoROM==0.6.1", - "pytest-cov==4.1.0", - "pytest-xdist==3.5.0", - "pre-commit==3.5.0", - "pytest-markdown-docs==0.5.0" + "pynput>=1.7.6", + "pytest>=8.0.0", + "AutoROM>=0.6.1", + "pytest-cov>=4.1.0", + "pytest-xdist>=3.5.0", + "pre-commit>=3.5.0", + "pytest-markdown-docs>=0.5.0" ] all = [ - "multi_agent_ale_py==0.1.11", - "pygame==2.3.0", - "chess==1.9.4", - "rlcard==1.0.5", + "multi_agent_ale_py>=0.1.11", + "pygame>=2.3.0", + "chess>=1.9.4", + "rlcard>=1.0.5", "shimmy[openspiel]>=1.2.0", - "pymunk==6.2.0", - "box2d-py==2.3.5", + "pymunk>=6.2.0", + "box2d-py>=2.3.5", "scipy>=1.4.1", "pillow>=8.0.1", ] diff --git a/test/wrapper_test.py b/test/wrapper_test.py index 
650fe328b..a03bd81b3 100644 --- a/test/wrapper_test.py +++ b/test/wrapper_test.py @@ -3,8 +3,13 @@ import pytest from pettingzoo.butterfly import pistonball_v6 -from pettingzoo.classic import texas_holdem_no_limit_v6 -from pettingzoo.utils.wrappers import MultiEpisodeEnv, MultiEpisodeParallelEnv +from pettingzoo.classic import texas_holdem_no_limit_v6, tictactoe_v3 +from pettingzoo.utils.wrappers import ( + BaseWrapper, + MultiEpisodeEnv, + MultiEpisodeParallelEnv, + TerminateIllegalWrapper, +) @pytest.mark.parametrize(("num_episodes"), [1, 2, 3, 4, 5, 6]) @@ -67,3 +72,65 @@ def test_multi_episode_parallel_env_wrapper(num_episodes) -> None: assert ( steps == num_episodes * 125 ), f"Expected to have 125 steps per episode, got {steps / num_episodes}." + + +def _do_game(env: TerminateIllegalWrapper, seed: int) -> None: + """Run a single game with reproducible random moves.""" + assert isinstance( + env, TerminateIllegalWrapper + ), "test_terminate_illegal must use TerminateIllegalWrapper" + env.reset(seed) + for agent in env.agents: + # make the random moves reproducible + env.action_space(agent).seed(seed) + + for agent in env.agent_iter(): + _, _, termination, truncation, _ = env.last() + + if termination or truncation: + env.step(None) + else: + action = env.action_space(agent).sample() + env.step(action) + + +def test_terminate_illegal() -> None: + """Test for a problem with terminate illegal wrapper. + + The problem is that env variables, including agent_selection, are set by + calls from TerminateIllegalWrapper to env functions. However, they are + called by the wrapper object, not the env so they are set in the wrapper + object rather than the base env object. When the code later tries to run, + the values get updated in the env code, but the wrapper pulls it's own + values that shadow them. + + The test here confirms that is fixed. + """ + # not using env() because we need to ensure that the env is + # wrapped by TerminateIllegalWrapper + raw_env = tictactoe_v3.raw_env() + env = TerminateIllegalWrapper(raw_env, illegal_reward=-1) + + _do_game(env, 42) + # bug is triggered by a corrupted state after a game is terminated + # due to an illegal move. So we need to run the game twice to + # see the effect. 
+ _do_game(env, 42) + + # get a list of what all the agent_selection values in the wrapper stack + unwrapped = env + agent_selections = [] + while unwrapped != env.unwrapped: + # the actual value for this wrapper (or None if no value) + agent_selections.append(unwrapped.__dict__.get("agent_selection", None)) + assert isinstance(unwrapped, BaseWrapper) + unwrapped = unwrapped.env + + # last one from the actual env + agent_selections.append(unwrapped.__dict__.get("agent_selection", None)) + + # remove None from agent_selections + agent_selections = [x for x in agent_selections if x is not None] + + # all values must be the same, or else the wrapper and env are mismatched + assert len(set(agent_selections)) == 1, "agent_selection mismatch" diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py index a56464c5f..6b9ec9770 100644 --- a/tutorials/AgileRL/agilerl_dqn_curriculum.py +++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py @@ -2,6 +2,7 @@ Author: Nick (https://github.com/nicku-a) """ + import copy import os import random @@ -15,7 +16,7 @@ from agilerl.components.replay_buffer import ReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population from tqdm import tqdm, trange from pettingzoo.classic import connect_four_v3 @@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent): while not (done or truncation): # Player 0's turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip(observation, player=0) if opponent_first: p0_action = self.env.action_space("player_0").sample(p0_action_mask) else: if self.lesson["warm_up_opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, self.lesson["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) self.step(p0_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p0_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0) - p0_next_state = np.expand_dims(p0_next_state, 0) + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 + ) if done or truncation: reward = self.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues if p1_state is not None: reward = self.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent): # Player 1's turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 + ) if not 
opponent_first: p1_action = self.env.action_space("player_1").sample( p1_action_mask ) else: if self.lesson["warm_up_opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"] ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) self.step(p1_action) # Act in environment observation, env_reward, done, truncation, _ = self.last() - p1_next_state = np.moveaxis(observation["observation"], [-1], [-3]) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0) - p1_next_state = np.expand_dims(p1_next_state, 0) + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 + ) if done or truncation: reward = self.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( (p0_state, p1_state, p0_state_flipped, p1_state_flipped) ), @@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent): else: # Play continues reward = self.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -323,11 +320,11 @@ def __init__(self, env, difficulty): self.env = env.env self.difficulty = difficulty if self.difficulty == "random": - self.getAction = self.random_opponent + self.get_action = self.random_opponent elif self.difficulty == "weak": - self.getAction = self.weak_rule_based_opponent + self.get_action = self.weak_rule_based_opponent else: - self.getAction = self.strong_rule_based_opponent + self.get_action = self.strong_rule_based_opponent self.num_cols = 7 self.num_rows = 6 self.length = 4 @@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False): return (True, reward, ended) + ((lengths,) if return_length else ()) +def transform_and_flip(observation, player): + """Transforms and flips observation for input to agent's neural network. 
+ + :param observation: Observation to preprocess + :type observation: dict[str, np.ndarray] + :param player: Player, 0 or 1 + :type player: int + """ + state = observation["observation"] + # Pre-process dimensions for PyTorch (N, C, H, W) + state = np.moveaxis(state, [-1], [-3]) + if player == 1: + # Swap pieces so that the agent always sees the board from the same perspective + state[[0, 1], :, :] = state[[1, 0], :, :] + state_flipped = np.expand_dims(np.flip(state, 2), 0) + state = np.expand_dims(state, 0) + return state, state_flipped + + if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("===== AgileRL Curriculum Learning Demo =====") @@ -522,7 +538,6 @@ def outcome(self, action, player, return_length=False): "NUM_ATOMS": 51, # Unit number of support "V_MIN": 0.0, # Minimum value of support "V_MAX": 200.0, # Maximum value of support - "WANDB": False, # Use Weights and Biases tracking } # Define the connect four environment @@ -549,7 +564,7 @@ def outcome(self, action, player, return_length=False): action_dim = action_dim[0] # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -563,7 +578,6 @@ def outcome(self, action, player, return_length=False): # Configure the replay buffer field_names = ["state", "action", "reward", "next_state", "done"] memory = ReplayBuffer( - action_dim=action_dim, # Number of agent actions memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size field_names=field_names, # Field names to store in memory device=device, @@ -574,8 +588,8 @@ def outcome(self, action, player, return_length=False): tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -606,12 +620,7 @@ def outcome(self, action, player, return_length=False): # Define training loop parameters episodes_per_epoch = 10 - - # ! NOTE: Uncomment the max_episodes line below to change the number of training episodes. ! # - # It is deliberately set low to allow testing to ensure this tutorial is sound. 
- max_episodes = 10 - # max_episodes = LESSON["max_train_episodes"] # Total episodes - + max_episodes = LESSON["max_train_episodes"] # Total episodes max_steps = 500 # Maximum steps to take in each episode evo_epochs = 20 # Evolution frequency evo_loop = 50 # Number of evaluation episodes @@ -620,12 +629,11 @@ def outcome(self, action, player, return_length=False): eps_end = 0.1 # Final epsilon value eps_decay = 0.9998 # Epsilon decays opp_update_counter = 0 - wb = INIT_HP["WANDB"] if LESSON["pretrained_path"] is not None: for agent in pop: # Load pretrained checkpoint - agent.loadCheckpoint(LESSON["pretrained_path"]) + agent.load_checkpoint(LESSON["pretrained_path"]) # Reinit optimizer for new task agent.lr = INIT_HP["LR"] agent.optimizer = torch.optim.Adam( @@ -659,24 +667,23 @@ def outcome(self, action, player, return_length=False): print("Agent population warmed up.") if max_episodes > 0: - if wb: - wandb.init( - # set the wandb project where this run will be logged - project="AgileRL", - name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( - "connect_four_v3", - INIT_HP["ALGO"], - LESSON["opponent"], - datetime.now().strftime("%m%d%Y%H%M%S"), - ), - # track hyperparameters and run metadata - config={ - "algo": "Evo HPO Rainbow DQN", - "env": "connect_four_v3", - "INIT_HP": INIT_HP, - "lesson": LESSON, - }, - ) + wandb.init( + # set the wandb project where this run will be logged + project="AgileRL", + name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format( + "connect_four_v3", + INIT_HP["ALGO"], + LESSON["opponent"], + datetime.now().strftime("%m%d%Y%H%M%S"), + ), + # track hyperparameters and run metadata + config={ + "algo": "Evo HPO Rainbow DQN", + "env": "connect_four_v3", + "INIT_HP": INIT_HP, + "lesson": LESSON, + }, + ) total_steps = 0 total_episodes = 0 @@ -689,7 +696,7 @@ def outcome(self, action, player, return_length=False): for agent in pop: # Loop through population for episode in range(episodes_per_epoch): env.reset() # Reset environment at start of episode - observation, env_reward, done, truncation, _ = env.last() + observation, cumulative_reward, done, truncation, _ = env.last() ( p1_state, @@ -718,23 +725,23 @@ def outcome(self, action, player, return_length=False): for idx_step in range(max_steps): # Player 0"s turn p0_action_mask = observation["action_mask"] - p0_state = np.moveaxis(observation["observation"], [-1], [-3]) - p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0) - p0_state = np.expand_dims(p0_state, 0) + p0_state, p0_state_flipped = transform_and_flip( + observation, player=0 + ) if opponent_first: if LESSON["opponent"] == "self": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_state, 0, p0_action_mask )[0] elif LESSON["opponent"] == "random": - p0_action = opponent.getAction( + p0_action = opponent.get_action( p0_action_mask, p1_action, LESSON["block_vert_coef"] ) else: - p0_action = opponent.getAction(player=0) + p0_action = opponent.get_action(player=0) else: - p0_action = agent.getAction( + p0_action = agent.get_action( p0_state, epsilon, p0_action_mask )[ 0 @@ -742,23 +749,18 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p0_action] += 1 env.step(p0_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p0_next_state = np.moveaxis( - observation["observation"], [-1], [-3] + observation, cumulative_reward, done, truncation, _ = env.last() + p0_next_state, p0_next_state_flipped = transform_and_flip( + observation, player=0 ) - p0_next_state_flipped = np.expand_dims( 
- np.flip(p0_next_state, 2), 0 - ) - p0_next_state = np.expand_dims(p0_next_state, 0) - if not opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 0 win) if done or truncation: reward = env.reward(done=True, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -787,7 +789,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues if p1_state is not None: reward = env.reward(done=False, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p1_state, p1_state_flipped)), [p1_action, 6 - p1_action], [reward, reward], @@ -799,29 +801,25 @@ def outcome(self, action, player, return_length=False): # Player 1"s turn p1_action_mask = observation["action_mask"] - p1_state = np.moveaxis( - observation["observation"], [-1], [-3] + p1_state, p1_state_flipped = transform_and_flip( + observation, player=1 ) - # Swap pieces so that the agent always sees the board from the same perspective - p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :] - p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0) - p1_state = np.expand_dims(p1_state, 0) if not opponent_first: if LESSON["opponent"] == "self": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_state, 0, p1_action_mask )[0] elif LESSON["opponent"] == "random": - p1_action = opponent.getAction( + p1_action = opponent.get_action( p1_action_mask, p0_action, LESSON["block_vert_coef"], ) else: - p1_action = opponent.getAction(player=1) + p1_action = opponent.get_action(player=1) else: - p1_action = agent.getAction( + p1_action = agent.get_action( p1_state, epsilon, p1_action_mask )[ 0 @@ -829,24 +827,25 @@ def outcome(self, action, player, return_length=False): train_actions_hist[p1_action] += 1 env.step(p1_action) # Act in environment - observation, env_reward, done, truncation, _ = env.last() - p1_next_state = np.moveaxis( - observation["observation"], [-1], [-3] - ) - p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :] - p1_next_state_flipped = np.expand_dims( - np.flip(p1_next_state, 2), 0 + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() + p1_next_state, p1_next_state_flipped = transform_and_flip( + observation, player=1 ) - p1_next_state = np.expand_dims(p1_next_state, 0) if opponent_first: - score += env_reward + score = cumulative_reward turns += 1 # Check if game is over (Player 1 win) if done or truncation: reward = env.reward(done=True, player=1) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate( ( p0_state, @@ -880,7 +879,7 @@ def outcome(self, action, player, return_length=False): else: # Play continues reward = env.reward(done=False, player=0) - memory.save2memoryVectEnvs( + memory.save_to_memory_vect_envs( np.concatenate((p0_state, p0_state_flipped)), [p0_action, 6 - p0_action], [reward, reward], @@ -935,7 +934,13 @@ def outcome(self, action, player, return_length=False): rewards = [] for i in range(evo_loop): env.reset() # Reset environment at start of episode - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() player = -1 # Tracker for which player"s turn it is @@ -955,42 +960,52 @@ def outcome(self, action, player, return_length=False): if player < 0: if opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: 
- action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 if player > 0: if not opponent_first: if LESSON["eval_opponent"] == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: state = np.moveaxis( observation["observation"], [-1], [-3] ) - state[[0, 1], :, :] = state[[0, 1], :, :] + state[[0, 1], :, :] = state[[1, 0], :, :] state = np.expand_dims(state, 0) - action = agent.getAction(state, 0, action_mask)[ + action = agent.get_action( + state, 0, action_mask + )[ 0 ] # Get next action from agent eval_actions_hist[action] += 1 env.step(action) # Act in environment - observation, reward, done, truncation, _ = env.last() + ( + observation, + cumulative_reward, + done, + truncation, + _, + ) = env.last() if (player > 0 and opponent_first) or ( player < 0 and not opponent_first ): - score += reward + score = cumulative_reward eval_turns += 1 @@ -1027,31 +1042,29 @@ def outcome(self, action, player, return_length=False): for index, action in enumerate(eval_actions_hist) } - if wb: - wandb_dict = { - "global_step": total_steps, - "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), - "train/mean_turns_per_game": mean_turns, - "train/epsilon": epsilon, - "train/opponent_updates": opp_update_counter, - "eval/mean_fitness": np.mean(fitnesses), - "eval/best_fitness": np.max(fitnesses), - "eval/mean_turns_per_game": eval_turns, - } - wandb_dict.update(train_actions_dict) - wandb_dict.update(eval_actions_dict) - wandb.log(wandb_dict) + wandb_dict = { + "global_step": total_steps, + "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]), + "train/mean_turns_per_game": mean_turns, + "train/epsilon": epsilon, + "train/opponent_updates": opp_update_counter, + "eval/mean_fitness": np.mean(fitnesses), + "eval/best_fitness": np.max(fitnesses), + "eval/mean_turns_per_game": eval_turns, + } + wandb_dict.update(train_actions_dict) + wandb_dict.update(eval_actions_dict) + wandb.log(wandb_dict) # Tournament selection and population mutation elite, pop = tournament.select(pop) pop = mutations.mutation(pop) if max_episodes > 0: - if wb: - wandb.finish() + wandb.finish() # Save the trained agent save_path = LESSON["save_path"] os.makedirs(os.path.dirname(save_path), exist_ok=True) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) print(f"Elite agent saved to '{save_path}'.") diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py index 37e193f40..99d19e17c 100644 --- a/tutorials/AgileRL/agilerl_maddpg.py +++ b/tutorials/AgileRL/agilerl_maddpg.py @@ -2,22 +2,22 @@ Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a) """ + import os +from copy import deepcopy import numpy as np import supersuit as ss import torch from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer -from agilerl.hpo.mutation import Mutations -from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from 
pettingzoo.atari import space_invaders_v2 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MADDPG Demo =====") # Define the network configuration NET_CONFIG = { @@ -31,47 +31,59 @@ # Define the initial hyperparameters INIT_HP = { - "POPULATION_SIZE": 2, + "POPULATION_SIZE": 1, "ALGO": "MADDPG", # Algorithm # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": True, - "BATCH_SIZE": 8, # Batch size + "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor - "MEMORY_SIZE": 10000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "MEMORY_SIZE": 100000, # Max memory buffer size + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters } + num_envs = 8 # Define the space invaders environment as a parallel environment env = space_invaders_v2.parallel_env() - if INIT_HP["CHANNELS_LAST"]: - # Environment processing for image based observations - env = ss.frame_skip_v0(env, 4) - env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) - env = ss.color_reduction_v0(env, mode="B") - env = ss.resize_v1(env, x_size=84, y_size=84) - env = ss.frame_stack_v1(env, 4) + + # Environment processing for image based observations + env = ss.frame_skip_v0(env, 4) + env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1) + env = ss.color_reduction_v0(env, mode="B") + env = ss.resize_v1(env, x_size=84, y_size=84) + env = ss.frame_stack_v1(env, 4) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) + env.reset() # Configure the multi-agent algo input arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = [env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Pre-process image dimensions for pytorch convolutional layers if INIT_HP["CHANNELS_LAST"]: @@ -84,7 +96,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + agent = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -92,8 +104,9 @@ NET_CONFIG, INIT_HP, 
population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, - ) + )[0] # Configure the multi-agent replay buffer field_names = ["state", "action", "reward", "next_state", "done"] @@ -104,152 +117,138 @@ device=device, ) - # Instantiate a tournament selection object (used for HPO) - tournament = TournamentSelection( - tournament_size=2, # Tournament selection size - elitism=True, # Elitism in tournament selection - population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores - - # Instantiate a mutations object (used for HPO) - mutations = Mutations( - algo=INIT_HP["ALGO"], - no_mutation=0.2, # Probability of no mutation - architecture=0.2, # Probability of architecture mutation - new_layer_prob=0.2, # Probability of new layer mutation - parameters=0.2, # Probability of parameter mutation - activation=0, # Probability of activation function mutation - rl_hp=0.2, # Probability of RL hyperparameter mutation - rl_hp_selection=[ - "lr", - "learn_step", - "batch_size", - ], # RL hyperparams selected for mutation - mutation_sd=0.1, # Mutation strength - # Define search space for each hyperparameter - min_lr=0.0001, - max_lr=0.01, - min_learn_step=1, - max_learn_step=120, - min_batch_size=8, - max_batch_size=64, - agent_ids=INIT_HP["AGENT_IDS"], # Agent IDs - arch=NET_CONFIG["arch"], # MLP or CNN - rand_seed=1, - device=device, - ) - # Define training loop parameters - max_episodes = 5 # Total episodes (default: 6000) - max_steps = 900 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes - elite = pop[0] # Assign a placeholder "elite" agent - - # Training loop - for idx_epi in trange(max_episodes): - for agent in pop: # Loop through population - state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + agent_ids = deepcopy(env.agents) + max_steps = 20000 # Max steps (default: 2000000) + learning_delay = 500 # Steps before starting learning + training_steps = 10000 # Frequency at which we evaluate training score + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes + + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less(agent.steps[-1], max_steps): + state, info = env.reset() # Reset environment at start of episode + scores = np.zeros((num_envs, len(agent_ids))) + completed_episode_scores = [] + steps = 0 + if INIT_HP["CHANNELS_LAST"]: + state = { + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() + } + + for idx_step in range(training_steps // num_envs): + # Get next action from agent + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info + ) + if agent.discrete_actions: + action = discrete_action + else: + action = cont_actions + + # Act in environment + action = {agent: env.action_space(agent).sample() for agent in env.agents} + next_state, reward, termination, truncation, info = env.step(action) + if not termination: + assert False + scores += np.array(list(reward.values())).transpose() + total_steps += num_envs + steps += num_envs + + # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) - 
for agent_id, s in state.items() + next_state = { + agent_id: np.moveaxis(ns, [-1], [-3]) + for agent_id, ns in next_state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - - # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions - ) - if agent.discrete_actions: - action = discrete_action - else: - action = cont_actions - - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment - - # Image processing if necessary for the environment - if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} - next_state = { - agent_id: np.moveaxis(ns, [-1], [-3]) - for agent_id, ns in next_state.items() - } - - # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r - - # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + + # Save experiences to replay buffer + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) + + # Learn according to learning frequency + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm - - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } - state = next_state - - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] - - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}') + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif len(memory) >= agent.batch_size and memory.counter > learning_delay: + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + + state = next_state + + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + 
agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) + + pbar.update(training_steps) + + agent.steps[-1] += steps + + # Evaluate population + fitness = agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, + sum_scores=False, + ) + pop_episode_scores = np.array(completed_episode_scores) + mean_scores = np.mean(pop_episode_scores, axis=0) + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {agent.steps[-1]}") + print("Scores:") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} score: {mean_scores[idx]}") + print("Fitness") + for idx, sub_agent in enumerate(agent_ids): + print(f" {sub_agent} fitness: {fitness[idx]}") + print("Previous 5 fitness avgs") + for idx, sub_agent in enumerate(agent_ids): print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + f" {sub_agent} fitness average: {np.mean(agent.fitness[-5:], axis=0)[idx]}" ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Update step counter + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MADDPG" filename = "MADDPG_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + agent.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py index cc6ed9009..11335b45a 100644 --- a/tutorials/AgileRL/agilerl_matd3.py +++ b/tutorials/AgileRL/agilerl_matd3.py @@ -2,6 +2,7 @@ Authors: Michael (https://github.com/mikepratt1), Nickua (https://github.com/nicku-a) """ + import os import numpy as np @@ -9,14 +10,15 @@ from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer from agilerl.hpo.mutation import Mutations from agilerl.hpo.tournament import TournamentSelection -from agilerl.utils.utils import initialPopulation +from agilerl.utils.utils import create_population +from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv from tqdm import trange from pettingzoo.mpe import simple_speaker_listener_v4 if __name__ == "__main__": device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print("===== AgileRL MATD3 Demo =====") + print("===== AgileRL Online Multi-Agent Demo =====") # Define the network configuration NET_CONFIG = { @@ -31,36 +33,47 @@ # Swap image channels dimension from last to first [H, W, C] -> [C, H, W] "CHANNELS_LAST": False, "BATCH_SIZE": 32, # Batch size + "O_U_NOISE": True, # Ornstein Uhlenbeck action noise + "EXPL_NOISE": 0.1, # Action noise scale + "MEAN_NOISE": 0.0, # Mean action noise + "THETA": 0.15, # Rate of mean reversion in OU noise + "DT": 0.01, # Timestep for OU noise "LR_ACTOR": 0.001, # Actor learning rate - "LR_CRITIC": 0.01, # Critic learning rate + "LR_CRITIC": 0.001, # Critic learning rate "GAMMA": 0.95, # Discount factor "MEMORY_SIZE": 100000, # Max memory buffer size - "LEARN_STEP": 5, # Learning frequency + "LEARN_STEP": 100, # Learning frequency "TAU": 0.01, # For soft update of target parameters "POLICY_FREQ": 2, # Policy frequnecy } + num_envs = 8 # Define the simple speaker listener environment as a parallel environment env = simple_speaker_listener_v4.parallel_env(continuous_actions=True) + env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)]) env.reset() # Configure the multi-agent algo input 
arguments try: - state_dim = [env.observation_space(agent).n for agent in env.agents] + state_dim = [env.single_observation_space(agent).n for agent in env.agents] one_hot = True except Exception: - state_dim = [env.observation_space(agent).shape for agent in env.agents] + state_dim = [env.single_observation_space(agent).shape for agent in env.agents] one_hot = False try: - action_dim = [env.action_space(agent).n for agent in env.agents] + action_dim = [env.single_action_space(agent).n for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = True INIT_HP["MAX_ACTION"] = None INIT_HP["MIN_ACTION"] = None except Exception: - action_dim = [env.action_space(agent).shape[0] for agent in env.agents] + action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents] INIT_HP["DISCRETE_ACTIONS"] = False - INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents] - INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents] + INIT_HP["MAX_ACTION"] = [ + env.single_action_space(agent).high for agent in env.agents + ] + INIT_HP["MIN_ACTION"] = [ + env.single_action_space(agent).low for agent in env.agents + ] # Not applicable to MPE environments, used when images are used for observations (Atari environments) if INIT_HP["CHANNELS_LAST"]: @@ -73,7 +86,7 @@ INIT_HP["AGENT_IDS"] = env.agents # Create a population ready for evolutionary hyper-parameter optimisation - pop = initialPopulation( + pop = create_population( INIT_HP["ALGO"], state_dim, action_dim, @@ -81,6 +94,7 @@ NET_CONFIG, INIT_HP, population_size=INIT_HP["POPULATION_SIZE"], + num_envs=num_envs, device=device, ) @@ -98,8 +112,8 @@ tournament_size=2, # Tournament selection size elitism=True, # Elitism in tournament selection population_size=INIT_HP["POPULATION_SIZE"], # Population size - evo_step=1, - ) # Evaluate using last N fitness scores + eval_loop=1, # Evaluate using last N fitness scores + ) # Instantiate a mutations object (used for HPO) mutations = Mutations( @@ -123,116 +137,148 @@ ) # Define training loop parameters - max_episodes = 500 # Total episodes (default: 6000) - max_steps = 25 # Maximum steps to take in each episode - epsilon = 1.0 # Starting epsilon value - eps_end = 0.1 # Final epsilon value - eps_decay = 0.995 # Epsilon decay - evo_epochs = 20 # Evolution frequency - evo_loop = 1 # Number of evaluation episodes + max_steps = 13000 # Max steps (default: 2000000) + learning_delay = 0 # Steps before starting learning + evo_steps = 1000 # Evolution frequency + eval_steps = None # Evaluation steps per episode - go until done + eval_loop = 1 # Number of evaluation episodes elite = pop[0] # Assign a placeholder "elite" agent - # Training loop - for idx_epi in trange(max_episodes): + total_steps = 0 + + # TRAINING LOOP + print("Training...") + pbar = trange(max_steps, unit="step") + while np.less([agent.steps[-1] for agent in pop], max_steps).all(): + pop_episode_scores = [] for agent in pop: # Loop through population state, info = env.reset() # Reset environment at start of episode - agent_reward = {agent_id: 0 for agent_id in env.agents} + scores = np.zeros(num_envs) + completed_episode_scores = [] + steps = 0 if INIT_HP["CHANNELS_LAST"]: state = { - agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3]) + agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items() } - for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() 
- else None - ) - + for idx_step in range(evo_steps // num_envs): # Get next action from agent - cont_actions, discrete_action = agent.getAction( - state, epsilon, agent_mask, env_defined_actions + cont_actions, discrete_action = agent.get_action( + states=state, training=True, infos=info ) if agent.discrete_actions: action = discrete_action else: action = cont_actions - next_state, reward, termination, truncation, info = env.step( - action - ) # Act in environment + # Act in environment + next_state, reward, termination, truncation, info = env.step(action) + + scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1) + total_steps += num_envs + steps += num_envs # Image processing if necessary for the environment if INIT_HP["CHANNELS_LAST"]: - state = {agent_id: np.squeeze(s) for agent_id, s in state.items()} next_state = { agent_id: np.moveaxis(ns, [-1], [-3]) for agent_id, ns in next_state.items() } # Save experiences to replay buffer - memory.save2memory(state, cont_actions, reward, next_state, termination) - - # Collect the reward - for agent_id, r in reward.items(): - agent_reward[agent_id] += r + memory.save_to_memory( + state, + cont_actions, + reward, + next_state, + termination, + is_vectorised=True, + ) # Learn according to learning frequency - if (memory.counter % agent.learn_step == 0) and ( - len(memory) >= agent.batch_size + # Handle learn steps > num_envs + if agent.learn_step > num_envs: + learn_step = agent.learn_step // num_envs + if ( + idx_step % learn_step == 0 + and len(memory) >= agent.batch_size + and memory.counter > learning_delay + ): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) + # Handle num_envs > learn step; learn multiple times per step in env + elif ( + len(memory) >= agent.batch_size and memory.counter > learning_delay ): - experiences = memory.sample( - agent.batch_size - ) # Sample replay buffer - agent.learn(experiences) # Learn according to agent's RL algorithm + for _ in range(num_envs // agent.learn_step): + # Sample replay buffer + experiences = memory.sample(agent.batch_size) + # Learn according to agent's RL algorithm + agent.learn(experiences) - # Update the state - if INIT_HP["CHANNELS_LAST"]: - next_state = { - agent_id: np.expand_dims(ns, 0) - for agent_id, ns in next_state.items() - } state = next_state - # Stop episode if any agents have terminated - if any(truncation.values()) or any(termination.values()): - break - - # Save the total episode reward - score = sum(agent_reward.values()) - agent.scores.append(score) - - # Update epsilon for exploration - epsilon = max(eps_end, epsilon * eps_decay) - - # Now evolve population if necessary - if (idx_epi + 1) % evo_epochs == 0: - # Evaluate population - fitnesses = [ - agent.test( - env, - swap_channels=INIT_HP["CHANNELS_LAST"], - max_steps=max_steps, - loop=evo_loop, - ) - for agent in pop - ] + # Calculate scores and reset noise for finished episodes + reset_noise_indices = [] + term_array = np.array(list(termination.values())).transpose() + trunc_array = np.array(list(truncation.values())).transpose() + for idx, (d, t) in enumerate(zip(term_array, trunc_array)): + if np.any(d) or np.any(t): + completed_episode_scores.append(scores[idx]) + agent.scores.append(scores[idx]) + scores[idx] = 0 + reset_noise_indices.append(idx) + agent.reset_action_noise(reset_noise_indices) - print(f"Episode {idx_epi + 1}/{max_episodes}") - print(f'Fitnesses: {["%.2f" % fitness for fitness in 
fitnesses]}') - print( - f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}' + pbar.update(evo_steps // len(pop)) + + agent.steps[-1] += steps + pop_episode_scores.append(completed_episode_scores) + + # Evaluate population + fitnesses = [ + agent.test( + env, + swap_channels=INIT_HP["CHANNELS_LAST"], + max_steps=eval_steps, + loop=eval_loop, ) + for agent in pop + ] + mean_scores = [ + ( + np.mean(episode_scores) + if len(episode_scores) > 0 + else "0 completed episodes" + ) + for episode_scores in pop_episode_scores + ] + + print(f"--- Global steps {total_steps} ---") + print(f"Steps {[agent.steps[-1] for agent in pop]}") + print(f"Scores: {mean_scores}") + print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}') + print( + f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}' + ) - # Tournament selection and population mutation - elite, pop = tournament.select(pop) - pop = mutations.mutation(pop) + # Tournament selection and population mutation + elite, pop = tournament.select(pop) + pop = mutations.mutation(pop) + + # Update step counter + for agent in pop: + agent.steps.append(agent.steps[-1]) # Save the trained algorithm path = "./models/MATD3" filename = "MATD3_trained_agent.pt" os.makedirs(path, exist_ok=True) save_path = os.path.join(path, filename) - elite.saveCheckpoint(save_path) + elite.save_checkpoint(save_path) + + pbar.close() + env.close() diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py index f5a2d4b38..67d3ad9cc 100644 --- a/tutorials/AgileRL/render_agilerl_dqn.py +++ b/tutorials/AgileRL/render_agilerl_dqn.py @@ -4,7 +4,7 @@ import numpy as np import torch from agilerl.algorithms.dqn import DQN -from agilerl_dqn_curriculum import Opponent +from agilerl_dqn_curriculum import Opponent, transform_and_flip from PIL import Image, ImageDraw, ImageFont from pettingzoo.classic import connect_four_v3 @@ -68,16 +68,8 @@ def resize_frames(frames, fraction): state_dim = np.zeros(state_dim[0]).flatten().shape action_dim = action_dim[0] - # Instantiate an DQN object - dqn = DQN( - state_dim, - action_dim, - one_hot, - device=device, - ) - - # Load the saved algorithm into the DQN object - dqn.loadCheckpoint(path) + # Load the saved agent + dqn = DQN.load(path, device) for opponent_difficulty in ["random", "weak", "strong", "self"]: # Create opponent @@ -120,38 +112,35 @@ def resize_frames(frames, fraction): for idx_step in range(max_steps): action_mask = observation["action_mask"] if player < 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=0) if opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=0) + action = opponent.get_action(player=0) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 ] # Get next action from agent if player > 0: - state = np.moveaxis(observation["observation"], [-1], [-3]) - state[[0, 1], :, :] = state[[0, 1], :, :] - state = np.expand_dims(state, 0) + state, _ = transform_and_flip(observation, player=1) if not opponent_first: if opponent_difficulty == "self": - action = opponent.getAction( + action = opponent.get_action( state, 
epsilon=0, action_mask=action_mask )[0] elif opponent_difficulty == "random": - action = opponent.getAction(action_mask) + action = opponent.get_action(action_mask) else: - action = opponent.getAction(player=1) + action = opponent.get_action(player=1) else: - action = dqn.getAction( + action = dqn.get_action( state, epsilon=0, action_mask=action_mask )[ 0 diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py index ca47349d5..2713b48fd 100644 --- a/tutorials/AgileRL/render_agilerl_maddpg.py +++ b/tutorials/AgileRL/render_agilerl_maddpg.py @@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - maddpg = MADDPG( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MADDPG/MADDPG_trained_agent.pt" - maddpg.loadCheckpoint(path) + maddpg = MADDPG.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -106,20 +93,9 @@ def _label_with_episode_number(frame, episode_num): agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1]) for agent_id, s in state.items() } - - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = maddpg.getAction( - state, - epsilon=0, - agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = maddpg.get_action( + state, training=False, infos=info ) if maddpg.discrete_actions: action = discrete_action @@ -131,7 +107,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py index efcc610cd..8bfae5673 100644 --- a/tutorials/AgileRL/render_agilerl_matd3.py +++ b/tutorials/AgileRL/render_agilerl_matd3.py @@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num): n_agents = env.num_agents agent_ids = env.agents - # Instantiate an MADDPG object - matd3 = MATD3( - state_dim, - action_dim, - one_hot, - n_agents, - agent_ids, - max_action, - min_action, - discrete_actions, - device=device, - ) - - # Load the saved algorithm into the MADDPG object + # Load the saved agent path = "./models/MATD3/MATD3_trained_agent.pt" - matd3.loadCheckpoint(path) + matd3 = MATD3.load(path, device) # Define test loop parameters episodes = 10 # Number of episodes to test agent on @@ -94,19 +81,9 @@ def _label_with_episode_number(frame, episode_num): agent_reward = {agent_id: 0 for agent_id in agent_ids} score = 0 for _ in range(max_steps): - agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None - env_defined_actions = ( - info["env_defined_actions"] - if "env_defined_actions" in info.keys() - else None - ) - # Get next action from agent - cont_actions, discrete_action = matd3.getAction( - state, - epsilon=0, - 
agent_mask=agent_mask, - env_defined_actions=env_defined_actions, + cont_actions, discrete_action = matd3.get_action( + state, training=False, infos=info ) if matd3.discrete_actions: action = discrete_action @@ -118,7 +95,9 @@ def _label_with_episode_number(frame, episode_num): frames.append(_label_with_episode_number(frame, episode_num=ep)) # Take action in environment - state, reward, termination, truncation, info = env.step(action) + state, reward, termination, truncation, info = env.step( + {agent: a.squeeze() for agent, a in action.items()} + ) # Save agent's reward for this step in this episode for agent_id, r in reward.items(): diff --git a/tutorials/CustomEnvironment/tutorial3_action_masking.py b/tutorials/CustomEnvironment/tutorial3_action_masking.py index 24676373f..c0dfe2170 100644 --- a/tutorials/CustomEnvironment/tutorial3_action_masking.py +++ b/tutorials/CustomEnvironment/tutorial3_action_masking.py @@ -193,7 +193,7 @@ def step(self, actions): def render(self): """Renders the environment.""" - grid = np.zeros((7, 7)) + grid = np.zeros((8, 8), dtype=object) grid[self.prisoner_y, self.prisoner_x] = "P" grid[self.guard_y, self.guard_x] = "G" grid[self.escape_y, self.escape_x] = "E" diff --git a/tutorials/SB3/connect_four/requirements.txt b/tutorials/SB3/connect_four/requirements.txt index bf7c59673..e8ed650ab 100644 --- a/tutorials/SB3/connect_four/requirements.txt +++ b/tutorials/SB3/connect_four/requirements.txt @@ -1,3 +1,4 @@ pettingzoo[classic]>=1.24.0 stable-baselines3>=2.0.0 sb3-contrib>=2.0.0 +gymnasium<=0.29.1 diff --git a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py index d8d890362..e3dc63d34 100644 --- a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py +++ b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py @@ -9,6 +9,7 @@ import os import time +import gymnasium as gym from sb3_contrib import MaskablePPO from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy from sb3_contrib.common.wrappers import ActionMasker @@ -37,9 +38,23 @@ def reset(self, seed=None, options=None): return self.observe(self.agent_selection), {} def step(self, action): - """Gymnasium-like step function, returning observation, reward, termination, truncation, info.""" + """Gymnasium-like step function, returning observation, reward, termination, truncation, info. + + The observation is for the next agent (used to determine the next action), while the remaining + items are for the agent that just acted (used to understand what just happened). + """ + current_agent = self.agent_selection + super().step(action) - return super().last() + + next_agent = self.agent_selection + return ( + self.observe(next_agent), + self._cumulative_rewards[current_agent], + self.terminations[current_agent], + self.truncations[current_agent], + self.infos[current_agent], + ) def observe(self, agent): """Return only raw observation, removing action mask.""" @@ -160,6 +175,11 @@ def eval_action_mask(env_fn, num_games=100, render_mode=None, **env_kwargs): if __name__ == "__main__": + if gym.__version__ > "0.29.1": + raise ImportError( + f"This script requires gymnasium version 0.29.1 or lower, but you have version {gym.__version__}." 
+ ) + env_fn = connect_four_v3 env_kwargs = {} diff --git a/tutorials/SB3/test/test_sb3_action_mask.py b/tutorials/SB3/test/test_sb3_action_mask.py index 3835af393..2be85b1d8 100644 --- a/tutorials/SB3/test/test_sb3_action_mask.py +++ b/tutorials/SB3/test/test_sb3_action_mask.py @@ -23,14 +23,14 @@ EASY_ENVS = [ gin_rummy_v4, texas_holdem_no_limit_v6, # texas holdem human rendered game ends instantly, but with random actions it works fine - texas_holdem_v4, + tictactoe_v3, + leduc_holdem_v4, ] # More difficult environments which will likely take more training time MEDIUM_ENVS = [ - leduc_holdem_v4, # with 10x as many steps it gets higher total rewards (9 vs -9), 0.52 winrate, and 0.92 vs 0.83 total scores hanabi_v5, # even with 10x as many steps, total score seems to always be tied between the two agents - tictactoe_v3, # even with 10x as many steps, agent still loses every time (most likely an error somewhere) + texas_holdem_v4, # this performs poorly with updates to SB3 wrapper chess_v6, # difficult to train because games take so long, performance varies heavily ] @@ -50,8 +50,7 @@ def test_action_mask_easy(env_fn): env_kwargs = {} - # Leduc Hold`em takes slightly longer to outperform random - steps = 8192 if env_fn != leduc_holdem_v4 else 8192 * 4 + steps = 8192 * 4 # Train a model against itself (takes ~2 minutes on GPU) train_action_mask(env_fn, steps=steps, seed=0, **env_kwargs) @@ -92,7 +91,7 @@ def test_action_mask_medium(env_fn): assert ( winrate < 0.75 - ), "Policy should not perform better than 75% winrate" # 30-40% for leduc, 0% for hanabi, 0% for tic-tac-toe + ), "Policy should not perform better than 75% winrate" # 30-40% for leduc, 0% for hanabi # Watch two games (disabled by default) # eval_action_mask(env_fn, num_games=2, render_mode="human", **env_kwargs) diff --git a/tutorials/Tianshou/requirements.txt b/tutorials/Tianshou/requirements.txt index b7b8d4a47..b92064488 100644 --- a/tutorials/Tianshou/requirements.txt +++ b/tutorials/Tianshou/requirements.txt @@ -1,3 +1,4 @@ -pettingzoo[classic]==1.23.0 -packaging==21.3 +numpy<2.0.0 +pettingzoo[classic]>=1.23.0 +packaging>=21.3 tianshou==0.5.0
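Note on the learning-frequency handling used in the updated AgileRL training loops above: learning is gated on the replay-buffer fill level and on a learning frequency expressed in environment steps, with one branch for agent.learn_step > num_envs and another for num_envs >= agent.learn_step. The sketch below is a minimal, self-contained illustration of that bookkeeping, not AgileRL or PettingZoo code; learn_updates, buffer_len and buffer_counter are hypothetical stand-ins for the real replay buffer and agent.learn() calls, and the constants assume the MATD3 tutorial's LEARN_STEP = 100 and BATCH_SIZE = 32 together with the MADDPG loop's learning_delay of 500.

# Minimal sketch of the learning-frequency bookkeeping in the updated training loops.
# Not AgileRL code; learn_updates/buffer_len/buffer_counter are illustrative only.
num_envs = 8          # parallel environments in the vectorised env
learn_step = 100      # learning frequency, in environment steps (INIT_HP["LEARN_STEP"])
batch_size = 32       # INIT_HP["BATCH_SIZE"]
learning_delay = 500  # environment steps collected before learning starts


def learn_updates(idx_step, buffer_len, buffer_counter):
    """Return how many gradient updates to run after vectorised step idx_step."""
    if learn_step > num_envs:
        # Learn once every (learn_step // num_envs) vectorised steps.
        if (
            idx_step % (learn_step // num_envs) == 0
            and buffer_len >= batch_size
            and buffer_counter > learning_delay
        ):
            return 1
        return 0
    # num_envs >= learn_step: learn several times after every vectorised step.
    if buffer_len >= batch_size and buffer_counter > learning_delay:
        return num_envs // learn_step
    return 0


total_updates = 0
buffer_counter = 0
for idx_step in range(1000):
    buffer_counter += num_envs  # each vectorised step stores num_envs transitions
    total_updates += learn_updates(idx_step, buffer_counter, buffer_counter)
print(f"gradient updates over 1000 vectorised steps: {total_updates}")

With these values the first branch applies, giving one update roughly every 96 environment steps (12 vectorised steps of 8 environments) once the delay and batch-size conditions are met.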