From e93fd91e733c02f4b65916489c4f88bce40b27f0 Mon Sep 17 00:00:00 2001
From: Michael Pratt <118982716+mikepratt1@users.noreply.github.com>
Date: Tue, 26 Mar 2024 22:41:10 +0000
Subject: [PATCH 01/22] Agilerl updates (#1196)
---
.github/workflows/linux-tutorials-test.yml | 27 ++++++++++++++++++++-
docs/tutorials/agilerl/DQN.md | 8 +++---
tutorials/AgileRL/agilerl_dqn_curriculum.py | 8 +++---
tutorials/AgileRL/agilerl_maddpg.py | 8 +++---
tutorials/AgileRL/agilerl_matd3.py | 2 +-
tutorials/AgileRL/requirements.txt | 3 +--
6 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml
index 858203d13..f74a9b3c5 100644
--- a/.github/workflows/linux-tutorials-test.yml
+++ b/.github/workflows/linux-tutorials-test.yml
@@ -17,7 +17,32 @@ jobs:
fail-fast: false
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']
- tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test, AgileRL] # TODO: fix tutorials and add back Ray
+ tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test] # TODO: fix tutorials and add back Ray
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies and run tutorials
+ run: |
+ sudo apt-get install python3-opengl xvfb parallel
+ export PATH=/path/to/parallel:$PATH
+ export root_dir=$(pwd)
+ cd tutorials/${{ matrix.tutorial }}
+ pip install -r requirements.txt
+ pip uninstall -y pettingzoo
+ pip install -e $root_dir[testing]
+ AutoROM -v
+ for f in *.py; do xvfb-run -a -s "-screen 0 1024x768x24" python "$f"; done
+
+ agilerl-tutorial-test:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ['3.9', '3.10', '3.11']
+ tutorial: [AgileRL]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md
index 6a7cc3731..5b701f28c 100644
--- a/docs/tutorials/agilerl/DQN.md
+++ b/docs/tutorials/agilerl/DQN.md
@@ -612,10 +612,10 @@ Before we go any further in this tutorial, it would be helpful to define and set
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [64, 64], # Actor hidden size
- "c_size": [128], # CNN channel size
- "k_size": [4], # CNN kernel size
- "s_size": [1], # CNN stride size
+ "hidden_size": [64, 64], # Actor hidden size
+ "channel_size": [128], # CNN channel size
+ "kernel_size": [4], # CNN kernel size
+ "stride_size": [1], # CNN stride size
"normalize": False, # Normalize image from range [0,255] to [0,1]
}
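The four renamed keys above (`h_size` → `hidden_size`, `c_size` → `channel_size`, `k_size` → `kernel_size`, `s_size` → `stride_size`) recur in every AgileRL tutorial touched by this patch. As a minimal sketch, not part of the patch, a small helper could migrate an old-style config; the key mapping is taken from the diff, while the helper name and example dict are illustrative only:

```python
# Minimal sketch: migrate an old-style AgileRL NET_CONFIG to the renamed keys.
# The key mapping comes from the diff above; the helper itself is illustrative.
OLD_TO_NEW = {
    "h_size": "hidden_size",
    "c_size": "channel_size",
    "k_size": "kernel_size",
    "s_size": "stride_size",
}


def migrate_net_config(old_config: dict) -> dict:
    """Return a copy of the config with deprecated keys renamed."""
    return {OLD_TO_NEW.get(key, key): value for key, value in old_config.items()}


old = {"arch": "cnn", "h_size": [64, 64], "c_size": [128], "k_size": [4], "s_size": [1], "normalize": False}
assert migrate_net_config(old)["hidden_size"] == [64, 64]
```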
diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py
index 1b6e86949..a56464c5f 100644
--- a/tutorials/AgileRL/agilerl_dqn_curriculum.py
+++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py
@@ -494,10 +494,10 @@ def outcome(self, action, player, return_length=False):
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [64, 64], # Actor hidden size
- "c_size": [128], # CNN channel size
- "k_size": [4], # CNN kernel size
- "s_size": [1], # CNN stride size
+ "hidden_size": [64, 64], # Actor hidden size
+ "channel_size": [128], # CNN channel size
+ "kernel_size": [4], # CNN kernel size
+ "stride_size": [1], # CNN stride size
"normalize": False, # Normalize image from range [0,255] to [0,1]
}
diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py
index 14a93f6e1..37e193f40 100644
--- a/tutorials/AgileRL/agilerl_maddpg.py
+++ b/tutorials/AgileRL/agilerl_maddpg.py
@@ -22,10 +22,10 @@
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [32, 32], # Network hidden size
- "c_size": [32, 32], # CNN channel size
- "k_size": [3, 3], # CNN kernel size
- "s_size": [2, 2], # CNN stride size
+ "hidden_size": [32, 32], # Network hidden size
+ "channel_size": [32, 32], # CNN channel size
+ "kernel_size": [3, 3], # CNN kernel size
+ "stride_size": [2, 2], # CNN stride size
"normalize": True, # Normalize image from range [0,255] to [0,1]
}
diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py
index 46aefb5bf..cc6ed9009 100644
--- a/tutorials/AgileRL/agilerl_matd3.py
+++ b/tutorials/AgileRL/agilerl_matd3.py
@@ -21,7 +21,7 @@
# Define the network configuration
NET_CONFIG = {
"arch": "mlp", # Network architecture
- "h_size": [32, 32], # Actor hidden size
+ "hidden_size": [32, 32], # Actor hidden size
}
# Define the initial hyperparameters
diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt
index dbdd050a0..1262ee83c 100644
--- a/tutorials/AgileRL/requirements.txt
+++ b/tutorials/AgileRL/requirements.txt
@@ -1,5 +1,4 @@
-agilerl==0.1.21; python_version >= '3.9'
-agilerl==0.1.20; python_version < '3.9'
+agilerl==0.1.22; python_version >= '3.9'
pettingzoo[classic,atari,mpe]>=1.23.1
SuperSuit>=3.9.0
torch>=2.0.1
From 45f7f144c183d27098a4e507cd7cd01609d7e1ac Mon Sep 17 00:00:00 2001
From: Anthony Chang <40710895+Bamboofungus@users.noreply.github.com>
Date: Sat, 30 Mar 2024 00:30:52 +0900
Subject: [PATCH 02/22] Replace images that weren't rendering correctly with
inline LaTeX (#1198)
---
pettingzoo/classic/go/go.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/pettingzoo/classic/go/go.py b/pettingzoo/classic/go/go.py
index 3360f520e..ac1749019 100644
--- a/pettingzoo/classic/go/go.py
+++ b/pettingzoo/classic/go/go.py
@@ -81,14 +81,14 @@
| Action ID | Description |
| :----------------------------------------------------------: | ------------------------------------------------------------ |
-| | Place a stone on the 1st row of the board. _`0`: (0,0), `1`: (0,1), ..., `N-1`: (0,N-1)_ |
-| | Place a stone on the 2nd row of the board. _`N`: (1,0), `N+1`: (1,1), ..., `2N-1`: (1,N-1)_ |
+| $0 \ldots (N-1)$ | Place a stone on the 1st row of the board. _`0`: (0,0), `1`: (0,1), ..., `N-1`: (0,N-1)_ |
+| $N \ldots (2N- 1)$ | Place a stone on the 2nd row of the board. _`N`: (1,0), `N+1`: (1,1), ..., `2N-1`: (1,N-1)_ |
| ... | ... |
-| | Place a stone on the Nth row of the board. _`N^2-N`: (N-1,0), `N^2-N+1`: (N-1,1), ..., `N^2-1`: (N-1,N-1)_ |
-| | Pass |
+| $(N^2-N) \ldots (N^2-1)$ | Place a stone on the Nth row of the board. _`N^2-N`: (N-1,0), `N^2-N+1`: (N-1,1), ..., `N^2-1`: (N-1,N-1)_ |
+| $N^2$ | Pass |
-For example, you would use action `4` to place a stone on the board at the (0,3) location or action `N^2` to pass. You can transform a non-pass action `a` back into its 2D (x,y) coordinate by computing `(a//N, a%N)` The total action space is
-.
+For example, you would use action `4` to place a stone on the board at the (0,3) location or action `N^2` to pass. You can transform a non-pass action `a` back into its 2D (x,y) coordinate by computing `(a//N, a%N)`. The total action space is
+$N^2+1$.
### Rewards
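The rewritten table and the closing sentence fully specify the flat action encoding, so a tiny sketch can make it concrete. This is illustrative only and not part of the patch; `N` and the helper names are assumptions:

```python
# Illustrative sketch of the Go action encoding described above.
N = 19  # board size; the encoding works for any N


def action_to_coord(action: int, n: int = N):
    """Decode a flat action: n*n is pass, otherwise (row, col) = (a // n, a % n)."""
    if action == n * n:
        return "pass"
    return (action // n, action % n)


def coord_to_action(row: int, col: int, n: int = N) -> int:
    """Encode a board coordinate back into a flat action index."""
    return row * n + col


assert action_to_coord(0) == (0, 0)        # `0`: (0,0) from the table
assert action_to_coord(N + 1) == (1, 1)    # `N+1`: (1,1) from the table
assert action_to_coord(N * N) == "pass"    # the single extra pass action
assert coord_to_action(*action_to_coord(N + 1)) == N + 1
# Total action space size is N**2 + 1: every intersection plus pass.
```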
From 849414dfcba7f423f4db02bfecc7ef061cccc567 Mon Sep 17 00:00:00 2001
From: David Ackerman <145808634+dm-ackerman@users.noreply.github.com>
Date: Mon, 1 Apr 2024 13:59:15 -0400
Subject: [PATCH 03/22] Rename class agent_selector -> AgentSelector (#1194)
---
docs/api/utils.md | 2 +-
docs/code_examples/aec_rps.py | 6 ++---
docs/content/environment_creation.md | 6 ++---
.../cooperative_pong/cooperative_pong.py | 4 ++--
.../knights_archers_zombies.py | 4 ++--
pettingzoo/butterfly/pistonball/pistonball.py | 4 ++--
pettingzoo/classic/chess/chess.py | 6 ++---
.../classic/connect_four/connect_four.py | 4 ++--
pettingzoo/classic/go/go.py | 4 ++--
pettingzoo/classic/hanabi/hanabi.py | 4 ++--
pettingzoo/classic/rps/rps.py | 4 ++--
pettingzoo/classic/tictactoe/tictactoe.py | 4 ++--
pettingzoo/mpe/_mpe_utils/simple_env.py | 4 ++--
pettingzoo/sisl/multiwalker/multiwalker.py | 4 ++--
pettingzoo/sisl/pursuit/pursuit.py | 4 ++--
pettingzoo/sisl/waterworld/waterworld.py | 4 ++--
...enerated_agents_env_action_mask_info_v0.py | 4 ++--
...generated_agents_env_action_mask_obs_v0.py | 4 ++--
.../generated_agents_env_cust_agentid_v0.py | 4 ++--
.../example_envs/generated_agents_env_v0.py | 4 ++--
pettingzoo/utils/__init__.py | 2 +-
pettingzoo/utils/agent_selector.py | 22 ++++++++++++++-----
pettingzoo/utils/conversions.py | 6 ++---
23 files changed, 63 insertions(+), 51 deletions(-)
diff --git a/docs/api/utils.md b/docs/api/utils.md
index abc9d01fc..0b0e319cb 100644
--- a/docs/api/utils.md
+++ b/docs/api/utils.md
@@ -165,7 +165,7 @@ Base class which is used by [CaptureStdoutWrapper](https://pettingzoo.farama.org
The agent selector utility allows for easy cycling of agents in an AEC environment. At any time it can be reset or reinitialized with a new order, allowing for changes in turn order or handling a dynamic number of agents (see [Knights-Archers-Zombies](https://pettingzoo.farama.org/environments/butterfly/knights_archers_zombies/) for an example of spawning/killing agents)
-Note: while many PettingZoo environments use agent_selector to manage agent cycling internally, it is not intended to be used externally when interacting with an environment. Instead, use `for agent in env.agent_iter()` (see [AEC API Usage](https://pettingzoo.farama.org/api/aec/#usage)).
+Note: while many PettingZoo environments use AgentSelector to manage agent cycling internally, it is not intended to be used externally when interacting with an environment. Instead, use `for agent in env.agent_iter()` (see [AEC API Usage](https://pettingzoo.farama.org/api/aec/#usage)).
```{eval-rst}
.. currentmodule:: pettingzoo.utils
diff --git a/docs/code_examples/aec_rps.py b/docs/code_examples/aec_rps.py
index 7ae982167..7272f75bd 100644
--- a/docs/code_examples/aec_rps.py
+++ b/docs/code_examples/aec_rps.py
@@ -5,7 +5,7 @@
from gymnasium.spaces import Discrete
from pettingzoo import AECEnv
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
ROCK = 0
PAPER = 1
@@ -156,9 +156,9 @@ def reset(self, seed=None, options=None):
self.observations = {agent: NONE for agent in self.agents}
self.num_moves = 0
"""
- Our agent_selector utility allows easy cyclic stepping through the agents list.
+ Our AgentSelector utility allows easy cyclic stepping through the agents list.
"""
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.next()
def step(self, action):
diff --git a/docs/content/environment_creation.md b/docs/content/environment_creation.md
index 8b4332872..4347c49c7 100644
--- a/docs/content/environment_creation.md
+++ b/docs/content/environment_creation.md
@@ -62,14 +62,14 @@ The utils directory also contain some classes which are only helpful for develop
### Agent selector
-The `agent_selector` class steps through agents in a cycle
+The `AgentSelector` class steps through agents in a cycle
It can be used as follows to cycle through the list of agents:
```python
-from pettingzoo.utils import agent_selector
+from pettingzoo.utils import AgentSelector
agents = ["agent_1", "agent_2", "agent_3"]
-selector = agent_selector(agents)
+selector = AgentSelector(agents)
agent_selection = selector.reset()
# agent_selection will be "agent_1"
for i in range(100):
diff --git a/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py b/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py
index 0751a12e7..4573769fc 100644
--- a/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py
+++ b/pettingzoo/butterfly/cooperative_pong/cooperative_pong.py
@@ -79,7 +79,7 @@
from pettingzoo.butterfly.cooperative_pong.manual_policy import ManualPolicy
from pettingzoo.butterfly.cooperative_pong.paddle import Paddle
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
from pettingzoo.utils.conversions import parallel_wrapper_fn
FPS = 15
@@ -370,7 +370,7 @@ def __init__(self, **kwargs):
self.agents = self.env.agents[:]
self.possible_agents = self.agents[:]
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
# spaces
self.action_spaces = dict(zip(self.agents, self.env.action_space))
diff --git a/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py b/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py
index 0f21753e6..68a9bdfdc 100644
--- a/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py
+++ b/pettingzoo/butterfly/knights_archers_zombies/knights_archers_zombies.py
@@ -194,7 +194,7 @@
from pettingzoo.butterfly.knights_archers_zombies.src.players import Archer, Knight
from pettingzoo.butterfly.knights_archers_zombies.src.weapons import Arrow, Sword
from pettingzoo.butterfly.knights_archers_zombies.src.zombie import Zombie
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
sys.dont_write_bytecode = True
@@ -370,7 +370,7 @@ def __init__(
self.floor_patch3 = get_image(os.path.join("img", "patch3.png"))
self.floor_patch4 = get_image(os.path.join("img", "patch4.png"))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.reinit()
def observation_space(self, agent):
diff --git a/pettingzoo/butterfly/pistonball/pistonball.py b/pettingzoo/butterfly/pistonball/pistonball.py
index b15ea2872..65415593b 100644
--- a/pettingzoo/butterfly/pistonball/pistonball.py
+++ b/pettingzoo/butterfly/pistonball/pistonball.py
@@ -89,7 +89,7 @@
from pettingzoo import AECEnv
from pettingzoo.butterfly.pistonball.manual_policy import ManualPolicy
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
_image_library = {}
@@ -180,7 +180,7 @@ def __init__(
self.agents = ["piston_" + str(r) for r in range(self.n_pistons)]
self.possible_agents = self.agents[:]
self.agent_name_mapping = dict(zip(self.agents, list(range(self.n_pistons))))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.observation_spaces = dict(
zip(
diff --git a/pettingzoo/classic/chess/chess.py b/pettingzoo/classic/chess/chess.py
index 5100f8fc3..81b2ccb31 100644
--- a/pettingzoo/classic/chess/chess.py
+++ b/pettingzoo/classic/chess/chess.py
@@ -116,7 +116,7 @@
from pettingzoo import AECEnv
from pettingzoo.classic.chess import chess_utils
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env(**kwargs):
@@ -144,7 +144,7 @@ def __init__(self, render_mode: str | None = None, screen_height: int | None = 8
self.agents = [f"player_{i}" for i in range(2)]
self.possible_agents = self.agents[:]
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.action_spaces = {name: spaces.Discrete(8 * 8 * 73) for name in self.agents}
self.observation_spaces = {
@@ -238,7 +238,7 @@ def reset(self, seed=None, options=None):
self.board = chess.Board()
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
self.rewards = {name: 0 for name in self.agents}
diff --git a/pettingzoo/classic/connect_four/connect_four.py b/pettingzoo/classic/connect_four/connect_four.py
index e2a2390e9..48ce61ce1 100644
--- a/pettingzoo/classic/connect_four/connect_four.py
+++ b/pettingzoo/classic/connect_four/connect_four.py
@@ -69,7 +69,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def get_image(path):
@@ -220,7 +220,7 @@ def reset(self, seed=None, options=None):
self.truncations = {i: False for i in self.agents}
self.infos = {i: {} for i in self.agents}
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
diff --git a/pettingzoo/classic/go/go.py b/pettingzoo/classic/go/go.py
index ac1749019..d9a865c67 100644
--- a/pettingzoo/classic/go/go.py
+++ b/pettingzoo/classic/go/go.py
@@ -119,7 +119,7 @@
from pettingzoo import AECEnv
from pettingzoo.classic.go import coords, go_base
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def get_image(path):
@@ -191,7 +191,7 @@ def __init__(
[spaces.Discrete(self._N * self._N + 1) for _ in range(self.num_agents)]
)
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.board_history = np.zeros((self._N, self._N, 16), dtype=bool)
diff --git a/pettingzoo/classic/hanabi/hanabi.py b/pettingzoo/classic/hanabi/hanabi.py
index bd2f7480f..bd4441401 100644
--- a/pettingzoo/classic/hanabi/hanabi.py
+++ b/pettingzoo/classic/hanabi/hanabi.py
@@ -171,7 +171,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env(**kwargs):
@@ -441,7 +441,7 @@ def reset(self, seed=None, options=None):
self.truncations = self.hanabi_env.truncations
self.infos = self.hanabi_env.infos
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
def step(
diff --git a/pettingzoo/classic/rps/rps.py b/pettingzoo/classic/rps/rps.py
index 1b9eb6ad6..83c5abb3f 100644
--- a/pettingzoo/classic/rps/rps.py
+++ b/pettingzoo/classic/rps/rps.py
@@ -121,7 +121,7 @@
from gymnasium.utils import EzPickle
from pettingzoo import AECEnv
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
@@ -419,7 +419,7 @@ def close(self):
def reset(self, seed=None, options=None):
self.agents = self.possible_agents[:]
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.next()
self.rewards = {agent: 0 for agent in self.agents}
self._cumulative_rewards = {agent: 0 for agent in self.agents}
diff --git a/pettingzoo/classic/tictactoe/tictactoe.py b/pettingzoo/classic/tictactoe/tictactoe.py
index 45d357b6f..e68f900a8 100644
--- a/pettingzoo/classic/tictactoe/tictactoe.py
+++ b/pettingzoo/classic/tictactoe/tictactoe.py
@@ -80,7 +80,7 @@
from pettingzoo import AECEnv
from pettingzoo.classic.tictactoe.board import Board
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
def get_image(path):
@@ -143,7 +143,7 @@ def __init__(
self.truncations = {i: False for i in self.agents}
self.infos = {i: {"legal_moves": list(range(0, 9))} for i in self.agents}
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
self.render_mode = render_mode
diff --git a/pettingzoo/mpe/_mpe_utils/simple_env.py b/pettingzoo/mpe/_mpe_utils/simple_env.py
index 6d420fe76..af95b64d4 100644
--- a/pettingzoo/mpe/_mpe_utils/simple_env.py
+++ b/pettingzoo/mpe/_mpe_utils/simple_env.py
@@ -9,7 +9,7 @@
from pettingzoo import AECEnv
from pettingzoo.mpe._mpe_utils.core import Agent
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -75,7 +75,7 @@ def __init__(
agent.name: idx for idx, agent in enumerate(self.world.agents)
}
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
# set spaces
self.action_spaces = dict()
diff --git a/pettingzoo/sisl/multiwalker/multiwalker.py b/pettingzoo/sisl/multiwalker/multiwalker.py
index 8edf250d1..30adb9fe0 100644
--- a/pettingzoo/sisl/multiwalker/multiwalker.py
+++ b/pettingzoo/sisl/multiwalker/multiwalker.py
@@ -125,7 +125,7 @@
from pettingzoo import AECEnv
from pettingzoo.sisl.multiwalker.multiwalker_base import FPS
from pettingzoo.sisl.multiwalker.multiwalker_base import MultiWalkerEnv as _env
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
@@ -156,7 +156,7 @@ def __init__(self, *args, **kwargs):
self.agent_name_mapping = dict(
zip(self.agents, list(range(self.env.n_walkers)))
)
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
# spaces
self.action_spaces = dict(zip(self.agents, self.env.action_space))
self.observation_spaces = dict(zip(self.agents, self.env.observation_space))
diff --git a/pettingzoo/sisl/pursuit/pursuit.py b/pettingzoo/sisl/pursuit/pursuit.py
index c75728d31..c68f189bb 100644
--- a/pettingzoo/sisl/pursuit/pursuit.py
+++ b/pettingzoo/sisl/pursuit/pursuit.py
@@ -85,7 +85,7 @@
from pettingzoo import AECEnv
from pettingzoo.sisl.pursuit.manual_policy import ManualPolicy
from pettingzoo.sisl.pursuit.pursuit_base import Pursuit as _env
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
__all__ = ["ManualPolicy", "env", "parallel_env", "raw_env"]
@@ -118,7 +118,7 @@ def __init__(self, *args, **kwargs):
self.agents = ["pursuer_" + str(a) for a in range(self.env.num_agents)]
self.possible_agents = self.agents[:]
self.agent_name_mapping = dict(zip(self.agents, list(range(self.num_agents))))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
# spaces
self.n_act_agents = self.env.act_dims[0]
self.action_spaces = dict(zip(self.agents, self.env.action_space))
diff --git a/pettingzoo/sisl/waterworld/waterworld.py b/pettingzoo/sisl/waterworld/waterworld.py
index d2de2eb21..7684d7206 100644
--- a/pettingzoo/sisl/waterworld/waterworld.py
+++ b/pettingzoo/sisl/waterworld/waterworld.py
@@ -141,7 +141,7 @@
from pettingzoo import AECEnv
from pettingzoo.sisl.waterworld.waterworld_base import FPS
from pettingzoo.sisl.waterworld.waterworld_base import WaterworldBase as _env
-from pettingzoo.utils import agent_selector, wrappers
+from pettingzoo.utils import AgentSelector, wrappers
from pettingzoo.utils.conversions import parallel_wrapper_fn
@@ -171,7 +171,7 @@ def __init__(self, *args, **kwargs):
self.agents = ["pursuer_" + str(r) for r in range(self.env.num_agents)]
self.possible_agents = self.agents[:]
self.agent_name_mapping = dict(zip(self.agents, list(range(self.num_agents))))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
# spaces
self.action_spaces = dict(zip(self.agents, self.env.action_space))
diff --git a/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py b/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py
index 2985a07c6..1c48d6083 100644
--- a/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py
+++ b/pettingzoo/test/example_envs/generated_agents_env_action_mask_info_v0.py
@@ -5,7 +5,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env():
@@ -105,7 +105,7 @@ def reset(self, seed=None, options=None):
for i in range(5):
self.add_agent(self.np_random.choice(self.types))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
# seed observation and action spaces
diff --git a/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py b/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py
index b7cbf2b30..726afa6a9 100644
--- a/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py
+++ b/pettingzoo/test/example_envs/generated_agents_env_action_mask_obs_v0.py
@@ -5,7 +5,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env():
@@ -107,7 +107,7 @@ def reset(self, seed=None, options=None):
for i in range(5):
self.add_agent(self.np_random.choice(self.types))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
# seed observation and action spaces
diff --git a/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py b/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py
index 7f307d5e8..5b966b174 100644
--- a/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py
+++ b/pettingzoo/test/example_envs/generated_agents_env_cust_agentid_v0.py
@@ -5,7 +5,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env():
@@ -99,7 +99,7 @@ def reset(self, seed=None, options=None):
for i in range(5):
self.add_agent(self.np_random.choice(self.types))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
# seed observation and action spaces
diff --git a/pettingzoo/test/example_envs/generated_agents_env_v0.py b/pettingzoo/test/example_envs/generated_agents_env_v0.py
index 28f11469b..827465382 100644
--- a/pettingzoo/test/example_envs/generated_agents_env_v0.py
+++ b/pettingzoo/test/example_envs/generated_agents_env_v0.py
@@ -5,7 +5,7 @@
from pettingzoo import AECEnv
from pettingzoo.utils import wrappers
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
def env():
@@ -99,7 +99,7 @@ def reset(self, seed=None, options=None):
for i in range(5):
self.add_agent(self.np_random.choice(self.types))
- self._agent_selector = agent_selector(self.agents)
+ self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
# seed observation and action spaces
diff --git a/pettingzoo/utils/__init__.py b/pettingzoo/utils/__init__.py
index af9445539..1d16fe76b 100644
--- a/pettingzoo/utils/__init__.py
+++ b/pettingzoo/utils/__init__.py
@@ -1,4 +1,4 @@
-from pettingzoo.utils.agent_selector import agent_selector
+from pettingzoo.utils.agent_selector import AgentSelector
from pettingzoo.utils.average_total_reward import average_total_reward
from pettingzoo.utils.conversions import (
aec_to_parallel,
diff --git a/pettingzoo/utils/agent_selector.py b/pettingzoo/utils/agent_selector.py
index 0b6222990..2643b1c9a 100644
--- a/pettingzoo/utils/agent_selector.py
+++ b/pettingzoo/utils/agent_selector.py
@@ -1,16 +1,17 @@
from __future__ import annotations
from typing import Any
+from warnings import warn
-class agent_selector:
+class AgentSelector:
"""Outputs an agent in the given order whenever agent_select is called.
Can reinitialize to a new order.
Example:
- >>> from pettingzoo.utils import agent_selector
- >>> agent_selector = agent_selector(agent_order=["player1", "player2"])
+ >>> from pettingzoo.utils import AgentSelector
+ >>> agent_selector = AgentSelector(agent_order=["player1", "player2"])
>>> agent_selector.reset()
'player1'
>>> agent_selector.next()
@@ -52,8 +53,8 @@ def is_first(self) -> bool:
"""Check if the current agent is the first agent in the cycle."""
return self.selected_agent == self.agent_order[0]
- def __eq__(self, other: agent_selector) -> bool:
- if not isinstance(other, agent_selector):
+ def __eq__(self, other: AgentSelector) -> bool:
+ if not isinstance(other, AgentSelector):
return NotImplemented
return (
@@ -61,3 +62,14 @@ def __eq__(self, other: agent_selector) -> bool:
and self._current_agent == other._current_agent
and self.selected_agent == other.selected_agent
)
+
+
+class agent_selector(AgentSelector):
+ """Deprecated version of AgentSelector. Use that instead."""
+
+ def __init__(self, *args, **kwargs):
+ warn(
+ "agent_selector is deprecated, please use AgentSelector",
+ DeprecationWarning,
+ )
+ super().__init__(*args, **kwargs)
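Since the shim above only adds a `DeprecationWarning`, its behaviour can be checked with the standard library alone. A hedged sketch, not part of the patch; only names defined in `pettingzoo/utils/agent_selector.py` above are used:

```python
# Minimal sketch: the old class name still works but now warns on construction.
import warnings

from pettingzoo.utils.agent_selector import AgentSelector, agent_selector

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = agent_selector(agent_order=["player_0", "player_1"])

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert isinstance(legacy, AgentSelector)  # the shim is just a thin subclass

# New code constructs the selector under its new name and cycles as before.
selector = AgentSelector(["player_0", "player_1"])
assert selector.reset() == "player_0"
assert selector.next() == "player_1"
```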
diff --git a/pettingzoo/utils/conversions.py b/pettingzoo/utils/conversions.py
index 601a1fb06..7cf99f6d9 100644
--- a/pettingzoo/utils/conversions.py
+++ b/pettingzoo/utils/conversions.py
@@ -4,7 +4,7 @@
from collections import defaultdict
from typing import Callable, Dict, Optional
-from pettingzoo.utils import agent_selector
+from pettingzoo.utils import AgentSelector
from pettingzoo.utils.env import ActionType, AECEnv, AgentID, ObsType, ParallelEnv
from pettingzoo.utils.wrappers import OrderEnforcingWrapper
@@ -309,7 +309,7 @@ def reset(self, seed=None, options=None):
self._actions: Dict[AgentID, Optional[ActionType]] = {
agent: None for agent in self.agents
}
- self._agent_selector = agent_selector(self._live_agents)
+ self._agent_selector = AgentSelector(self._live_agents)
self.agent_selection = self._agent_selector.reset()
self.terminations = {agent: False for agent in self.agents}
self.truncations = {agent: False for agent in self.agents}
@@ -377,7 +377,7 @@ def step(self, action: Optional[ActionType]):
]
if len(self.env.agents):
- self._agent_selector = agent_selector(self.env.agents)
+ self._agent_selector = AgentSelector(self.env.agents)
self.agent_selection = self._agent_selector.reset()
self._deads_step_first()
From 6f9df27f924e8aa2906837e59b6afacdbab7d2f7 Mon Sep 17 00:00:00 2001
From: Joras Oliveira <43121361+JorasOliveira@users.noreply.github.com>
Date: Sat, 27 Apr 2024 10:59:33 -0300
Subject: [PATCH 04/22] Update third_party_envs.md (#1201)
---
docs/environments/third_party_envs.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/docs/environments/third_party_envs.md b/docs/environments/third_party_envs.md
index aeca31fb9..928904be9 100644
--- a/docs/environments/third_party_envs.md
+++ b/docs/environments/third_party_envs.md
@@ -57,6 +57,12 @@ CookingZoo: a gym-cooking derivative to simulate a complex cooking environment.
A library for doing reinforcement learning using [Crazyflie](https://www.bitcraze.io/products/crazyflie-2-1/) drones.
+### [DSSE: Drone Swarm Search Environment](https://github.com/pfeinsper/drone-swarm-search)
+[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.22.3-blue)]()
+![GitHub stars](https://img.shields.io/github/stars/pfeinsper/drone-swarm-search)
+
+A single and multi-agent environment to train swarms of drones for maritime search.
+
### [PettingZoo Dilemma Envs](https://github.com/tianyu-z/pettingzoo_dilemma_envs)
From 38e252020d0b067b8c04f7fd9e47c2943691a184 Mon Sep 17 00:00:00 2001
From: David Ackerman <145808634+dm-ackerman@users.noreply.github.com>
Date: Fri, 3 May 2024 17:35:06 -0400
Subject: [PATCH 05/22] Fix bug in SB3 tutorial ActionMask (#1203)
---
.../sb3_connect_four_action_mask.py | 18 ++++++++++++++++--
tutorials/SB3/test/test_sb3_action_mask.py | 9 ++++-----
2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
index d8d890362..29d623251 100644
--- a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
+++ b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
@@ -37,9 +37,23 @@ def reset(self, seed=None, options=None):
return self.observe(self.agent_selection), {}
def step(self, action):
- """Gymnasium-like step function, returning observation, reward, termination, truncation, info."""
+ """Gymnasium-like step function, returning observation, reward, termination, truncation, info.
+
+ The observation is for the next agent (used to determine the next action), while the remaining
+ items are for the agent that just acted (used to understand what just happened).
+ """
+ current_agent = self.agent_selection
+
super().step(action)
- return super().last()
+
+ next_agent = self.agent_selection
+ return (
+ self.observe(next_agent),
+ self._cumulative_rewards[current_agent],
+ self.terminations[current_agent],
+ self.truncations[current_agent],
+ self.infos[current_agent],
+ )
def observe(self, agent):
"""Return only raw observation, removing action mask."""
diff --git a/tutorials/SB3/test/test_sb3_action_mask.py b/tutorials/SB3/test/test_sb3_action_mask.py
index 3835af393..de4ee3c07 100644
--- a/tutorials/SB3/test/test_sb3_action_mask.py
+++ b/tutorials/SB3/test/test_sb3_action_mask.py
@@ -23,14 +23,14 @@
EASY_ENVS = [
gin_rummy_v4,
texas_holdem_no_limit_v6, # texas holdem human rendered game ends instantly, but with random actions it works fine
- texas_holdem_v4,
+ tictactoe_v3,
+ leduc_holdem_v4,
]
# More difficult environments which will likely take more training time
MEDIUM_ENVS = [
- leduc_holdem_v4, # with 10x as many steps it gets higher total rewards (9 vs -9), 0.52 winrate, and 0.92 vs 0.83 total scores
hanabi_v5, # even with 10x as many steps, total score seems to always be tied between the two agents
- tictactoe_v3, # even with 10x as many steps, agent still loses every time (most likely an error somewhere)
+ texas_holdem_v4, # this performs poorly with updates to SB3 wrapper
chess_v6, # difficult to train because games take so long, performance varies heavily
]
@@ -50,8 +50,7 @@ def test_action_mask_easy(env_fn):
env_kwargs = {}
- # Leduc Hold`em takes slightly longer to outperform random
- steps = 8192 if env_fn != leduc_holdem_v4 else 8192 * 4
+ steps = 8192 * 4
# Train a model against itself (takes ~2 minutes on GPU)
train_action_mask(env_fn, steps=steps, seed=0, **env_kwargs)
From 98e8c206daca7b45e1b61785d245628872c149b6 Mon Sep 17 00:00:00 2001
From: David Ackerman <145808634+dm-ackerman@users.noreply.github.com>
Date: Fri, 3 May 2024 17:40:37 -0400
Subject: [PATCH 06/22] Update the TicTacToe environment (#1192)
Co-authored-by: Elliot Tower
---
pettingzoo/classic/tictactoe/board.py | 155 ++++++++++++---------
pettingzoo/classic/tictactoe/test_board.py | 127 +++++++++++++++++
pettingzoo/classic/tictactoe/tictactoe.py | 86 +++++-------
tutorials/SB3/test/test_sb3_action_mask.py | 2 +-
4 files changed, 251 insertions(+), 119 deletions(-)
create mode 100644 pettingzoo/classic/tictactoe/test_board.py
diff --git a/pettingzoo/classic/tictactoe/board.py b/pettingzoo/classic/tictactoe/board.py
index 35186a57a..e6fee6853 100644
--- a/pettingzoo/classic/tictactoe/board.py
+++ b/pettingzoo/classic/tictactoe/board.py
@@ -1,79 +1,102 @@
+TTT_PLAYER1_WIN = 0
+TTT_PLAYER2_WIN = 1
+TTT_TIE = -1
+TTT_GAME_NOT_OVER = -2
+
+
class Board:
+ """Board for a TicTacToe Game.
+
+ This tracks the position and identity of marks on the game board
+ and allows checking for a winner.
+
+ Example of usage:
+
+ import random
+ board = Board()
+
+ # random legal moves - for example purposes
+ def choose_move(board_obj: Board) -> int:
+ legal_moves = [i for i, mark in enumerate(board_obj.squares) if mark == 0]
+ return random.choice(legal_moves)
+
+ player = 0
+ while True:
+ move = choose_move(board)
+ board.play_turn(player, move)
+ status = board.game_status()
+ if status != TTT_GAME_NOT_OVER:
+ if status in [TTT_PLAYER1_WIN, TTT_PLAYER2_WIN]:
+ print(f"player {status} won")
+ else: # status == TTT_TIE
+ print("Tie Game")
+ break
+ player = player ^ 1 # swaps between players 0 and 1
+ """
+
+ # indices of the winning lines: vertical(x3), horizontal(x3), diagonal(x2)
+ winning_combinations = [
+ (0, 1, 2),
+ (3, 4, 5),
+ (6, 7, 8),
+ (0, 3, 6),
+ (1, 4, 7),
+ (2, 5, 8),
+ (0, 4, 8),
+ (2, 4, 6),
+ ]
+
def __init__(self):
- # internally self.board.squares holds a flat representation of tic tac toe board
- # where an empty board is [0, 0, 0, 0, 0, 0, 0, 0, 0]
- # where indexes are column wise order
+ # self.squares holds a flat representation of the tic tac toe board.
+ # an empty board is [0, 0, 0, 0, 0, 0, 0, 0, 0].
+ # player 1's squares are marked 1, while player 2's are marked 2.
+ # mapping of the flat indices to the 3x3 grid is as follows:
# 0 3 6
# 1 4 7
# 2 5 8
-
- # empty -- 0
- # player 0 -- 1
- # player 1 -- 2
self.squares = [0] * 9
- # precommute possible winning combinations
- self.calculate_winners()
+ @property
+ def _n_empty_squares(self):
+ """The current number of empty squares on the board."""
+ return self.squares.count(0)
- def setup(self):
- self.calculate_winners()
+ def reset(self):
+ """Remove all marks from the board."""
+ self.squares = [0] * 9
def play_turn(self, agent, pos):
- # if spot is empty
- if self.squares[pos] != 0:
- return
- if agent == 0:
- self.squares[pos] = 1
- elif agent == 1:
- self.squares[pos] = 2
- return
-
- def calculate_winners(self):
- winning_combinations = []
- indices = [x for x in range(0, 9)]
-
- # Vertical combinations
- winning_combinations += [
- tuple(indices[i : (i + 3)]) for i in range(0, len(indices), 3)
- ]
-
- # Horizontal combinations
- winning_combinations += [
- tuple(indices[x] for x in range(y, len(indices), 3)) for y in range(0, 3)
- ]
-
- # Diagonal combinations
- winning_combinations.append(tuple(x for x in range(0, len(indices), 4)))
- winning_combinations.append(tuple(x for x in range(2, len(indices) - 1, 2)))
-
- self.winning_combinations = winning_combinations
-
- # returns:
- # -1 for no winner
- # 1 -- agent 0 wins
- # 2 -- agent 1 wins
- def check_for_winner(self):
- winner = -1
- for combination in self.winning_combinations:
- states = []
- for index in combination:
- states.append(self.squares[index])
- if all(x == 1 for x in states):
- winner = 1
- if all(x == 2 for x in states):
- winner = 2
- return winner
-
- def check_game_over(self):
- winner = self.check_for_winner()
-
- if winner == -1 and all(square in [1, 2] for square in self.squares):
- # tie
- return True
- elif winner in [1, 2]:
- return True
- else:
- return False
+ """Place a mark by the agent in the spot given.
+
+ The following are required for a move to be valid:
+ * The agent must be a known agent ID (either 0 or 1).
+ * The spot must be empty.
+ * The spot must be in the board (integer: 0 <= spot <= 8).
+
+ If any of those are not true, an assertion will fail.
+ """
+ assert pos >= 0 and pos <= 8, "Invalid move location"
+ assert agent in [0, 1], "Invalid agent"
+ assert self.squares[pos] == 0, "Location is not empty"
+
+ # agent is [0, 1]. board values are stored as [1, 2].
+ self.squares[pos] = agent + 1
+
+ def game_status(self):
+ """Return status (winner, TTT_TIE if no winner, or TTT_GAME_NOT_OVER)."""
+ for indices in self.winning_combinations:
+ states = [self.squares[idx] for idx in indices]
+ if states == [1, 1, 1]:
+ return TTT_PLAYER1_WIN
+ if states == [2, 2, 2]:
+ return TTT_PLAYER2_WIN
+ if self._n_empty_squares == 0:
+ return TTT_TIE
+ return TTT_GAME_NOT_OVER
def __str__(self):
return str(self.squares)
+
+ def legal_moves(self):
+ """Return list of legal moves (as flat indices for spaces on the board)."""
+ return [i for i, mark in enumerate(self.squares) if mark == 0]
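The class docstring already walks through a full random game; as a smaller hedged sketch, not part of the patch, the new `reset()`, `legal_moves()` and `game_status()` helpers compose like this:

```python
# Minimal sketch exercising the new Board helpers defined above.
from pettingzoo.classic.tictactoe.board import TTT_GAME_NOT_OVER, Board

board = Board()
assert board.legal_moves() == list(range(9))   # empty board: every square is legal
board.play_turn(0, 4)                          # player 0 takes the centre square
assert 4 not in board.legal_moves()            # occupied squares drop out
assert board.game_status() == TTT_GAME_NOT_OVER
board.reset()                                  # clears all marks again
assert board.legal_moves() == list(range(9))
```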
diff --git a/pettingzoo/classic/tictactoe/test_board.py b/pettingzoo/classic/tictactoe/test_board.py
new file mode 100644
index 000000000..b8f7e9248
--- /dev/null
+++ b/pettingzoo/classic/tictactoe/test_board.py
@@ -0,0 +1,127 @@
+"""Test cases for TicTacToe board."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from pettingzoo.classic.tictactoe.board import ( # type: ignore
+ TTT_GAME_NOT_OVER,
+ TTT_PLAYER1_WIN,
+ TTT_PLAYER2_WIN,
+ TTT_TIE,
+ Board,
+)
+
+# Note: mapping of moves to board positions are:
+# 0 3 6
+# 1 4 7
+# 2 5 8
+
+agent2_win = {
+ "moves": [
+ # agent_id, position, board after move
+ (0, 4, [0, 0, 0, 0, 1, 0, 0, 0, 0]),
+ (1, 0, [2, 0, 0, 0, 1, 0, 0, 0, 0]),
+ (0, 2, [2, 0, 1, 0, 1, 0, 0, 0, 0]),
+ (1, 6, [2, 0, 1, 0, 1, 0, 2, 0, 0]),
+ (0, 3, [2, 0, 1, 1, 1, 0, 2, 0, 0]),
+ (1, 7, [2, 0, 1, 1, 1, 0, 2, 2, 0]),
+ (0, 1, [2, 1, 1, 1, 1, 0, 2, 2, 0]),
+ (1, 8, [2, 1, 1, 1, 1, 0, 2, 2, 2]), # agent 2 wins here
+ (0, 5, [2, 1, 1, 1, 1, 1, 2, 2, 2]),
+ ],
+ "max_step": 7, # should not get past here
+ "winner": TTT_PLAYER2_WIN,
+}
+
+tie = {
+ "moves": [ # should be tie
+ (0, 0, [1, 0, 0, 0, 0, 0, 0, 0, 0]),
+ (1, 3, [1, 0, 0, 2, 0, 0, 0, 0, 0]),
+ (0, 1, [1, 1, 0, 2, 0, 0, 0, 0, 0]),
+ (1, 4, [1, 1, 0, 2, 2, 0, 0, 0, 0]),
+ (0, 5, [1, 1, 0, 2, 2, 1, 0, 0, 0]),
+ (1, 2, [1, 1, 2, 2, 2, 1, 0, 0, 0]),
+ (0, 6, [1, 1, 2, 2, 2, 1, 1, 0, 0]),
+ (1, 7, [1, 1, 2, 2, 2, 1, 1, 2, 0]),
+ (0, 8, [1, 1, 2, 2, 2, 1, 1, 2, 1]),
+ ],
+ "max_step": 8,
+ "winner": TTT_TIE,
+}
+
+agent1_win = {
+ "moves": [
+ (0, 0, [1, 0, 0, 0, 0, 0, 0, 0, 0]),
+ (1, 3, [1, 0, 0, 2, 0, 0, 0, 0, 0]),
+ (0, 1, [1, 1, 0, 2, 0, 0, 0, 0, 0]),
+ (1, 4, [1, 1, 0, 2, 2, 0, 0, 0, 0]),
+ (0, 2, [1, 1, 1, 2, 2, 0, 0, 0, 0]), # agent 1 should win here
+ (1, 5, [1, 1, 1, 2, 2, 2, 0, 0, 0]),
+ (0, 6, [1, 1, 1, 2, 2, 2, 1, 0, 0]),
+ (1, 7, [1, 1, 1, 2, 2, 2, 1, 2, 0]),
+ (0, 8, [1, 1, 1, 2, 2, 2, 1, 2, 1]),
+ ],
+ "max_step": 4,
+ "winner": TTT_PLAYER1_WIN,
+}
+
+
+@pytest.mark.parametrize("values", [agent1_win, agent2_win, tie])
+def test_tictactoe_board_games(values: dict[str, Any]) -> None:
+ """Test that TicTacToe games go as expected."""
+ expected_winner = values["winner"]
+ max_step = values["max_step"]
+
+ board = Board()
+ for i, (agent, pos, board_layout) in enumerate(values["moves"]):
+ assert i <= max_step, "max step exceeded in tictactoe game"
+ board.play_turn(agent, pos)
+ assert board_layout == board.squares, "wrong tictactoe layout after move"
+ status = board.game_status()
+ if status != TTT_GAME_NOT_OVER:
+ assert i == max_step, "tictactoe game ended on wrong step"
+ assert status == expected_winner, "wrong winner in tictactoe board test"
+ break
+
+
+def test_tictactoe_winning_boards() -> None:
+ """Test that winning board configurations actually win."""
+ # these are the winning lines for player 1. Only player 1's marks are
+ # filled in; game_status() only needs a completed line to declare the win.
+ winning_lines = [ # vertical(x3), horizontal(x3), diagonal(x2)
+ [1, 1, 1, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 1, 1, 1, 0, 0, 0],
+ [0, 0, 0, 0, 0, 0, 1, 1, 1],
+ [1, 0, 0, 1, 0, 0, 1, 0, 0],
+ [0, 1, 0, 0, 1, 0, 0, 1, 0],
+ [0, 0, 1, 0, 0, 1, 0, 0, 1],
+ [1, 0, 0, 0, 1, 0, 0, 0, 1],
+ [0, 0, 1, 0, 1, 0, 1, 0, 0],
+ ]
+ for line in winning_lines:
+ board = Board()
+ board.squares = line
+ assert board.game_status() == TTT_PLAYER1_WIN, "Bad win check in TicTacToe"
+
+
+def test_tictactoe_bad_move() -> None:
+ """Test that illegal TicTacToe moves are rejected."""
+ board = Board()
+ # 1) move out of bounds should be rejected
+ for outside_space in [-1, 9]:
+ with pytest.raises(AssertionError, match="Invalid move location"):
+ board.play_turn(0, outside_space)
+
+ # 2) move by unknown agent should be rejected
+ for unknown_agent in [-1, 2]:
+ with pytest.raises(AssertionError, match="Invalid agent"):
+ board.play_turn(unknown_agent, 0)
+
+ # 3) move in occupied space by either agent should be rejected
+ board.play_turn(0, 4) # this is fine
+ for agent in [0, 1]:
+ with pytest.raises(AssertionError, match="Location is not empty"):
+ board.play_turn(agent, 4) # repeating move is not valid
diff --git a/pettingzoo/classic/tictactoe/tictactoe.py b/pettingzoo/classic/tictactoe/tictactoe.py
index e68f900a8..e3c219c5a 100644
--- a/pettingzoo/classic/tictactoe/tictactoe.py
+++ b/pettingzoo/classic/tictactoe/tictactoe.py
@@ -79,11 +79,12 @@
from gymnasium.utils import EzPickle
from pettingzoo import AECEnv
-from pettingzoo.classic.tictactoe.board import Board
+from pettingzoo.classic.tictactoe.board import TTT_GAME_NOT_OVER, TTT_TIE, Board
from pettingzoo.utils import AgentSelector, wrappers
def get_image(path):
+ """Return a pygame image loaded from the given path."""
from os import path as os_path
cwd = os_path.dirname(__file__)
@@ -92,6 +93,7 @@ def get_image(path):
def get_font(path, size):
+ """Return a pygame font loaded from the given path."""
from os import path as os_path
cwd = os_path.dirname(__file__)
@@ -141,7 +143,7 @@ def __init__(
self.rewards = {i: 0 for i in self.agents}
self.terminations = {i: False for i in self.agents}
self.truncations = {i: False for i in self.agents}
- self.infos = {i: {"legal_moves": list(range(0, 9))} for i in self.agents}
+ self.infos = {i: {} for i in self.agents}
self._agent_selector = AgentSelector(self.agents)
self.agent_selection = self._agent_selector.reset()
@@ -153,42 +155,38 @@ def __init__(
if self.render_mode == "human":
self.clock = pygame.time.Clock()
- # Key
- # ----
- # blank space = 0
- # agent 0 = 1
- # agent 1 = 2
- # An observation is list of lists, where each list represents a row
- #
- # [[0,0,2]
- # [1,2,1]
- # [2,1,0]]
def observe(self, agent):
board_vals = np.array(self.board.squares).reshape(3, 3)
cur_player = self.possible_agents.index(agent)
opp_player = (cur_player + 1) % 2
- cur_p_board = np.equal(board_vals, cur_player + 1)
- opp_p_board = np.equal(board_vals, opp_player + 1)
-
- observation = np.stack([cur_p_board, opp_p_board], axis=2).astype(np.int8)
- legal_moves = self._legal_moves() if agent == self.agent_selection else []
+ observation = np.empty((3, 3, 2), dtype=np.int8)
+ # this will give a copy of the board that is 1 for the observing
+ # player's marks and zero for every other square, whether empty or not.
+ observation[:, :, 0] = np.equal(board_vals, cur_player + 1)
+ observation[:, :, 1] = np.equal(board_vals, opp_player + 1)
- action_mask = np.zeros(9, "int8")
- for i in legal_moves:
- action_mask[i] = 1
+ action_mask = self._get_mask(agent)
return {"observation": observation, "action_mask": action_mask}
+ def _get_mask(self, agent):
+ action_mask = np.zeros(9, dtype=np.int8)
+
+ # Per the documentation, the mask of any agent other than the
+ # currently selected one is all zeros.
+ if agent == self.agent_selection:
+ for i in self.board.legal_moves():
+ action_mask[i] = 1
+
+ return action_mask
+
def observation_space(self, agent):
return self.observation_spaces[agent]
def action_space(self, agent):
return self.action_spaces[agent]
- def _legal_moves(self):
- return [i for i in range(len(self.board.squares)) if self.board.squares[i] == 0]
-
# action in this case is a value from 0 to 8 indicating position to move on tictactoe board
def step(self, action):
if (
@@ -196,45 +194,30 @@ def step(self, action):
or self.truncations[self.agent_selection]
):
return self._was_dead_step(action)
- # check if input action is a valid move (0 == empty spot)
- assert self.board.squares[action] == 0, "played illegal move"
- # play turn
- self.board.play_turn(self.agents.index(self.agent_selection), action)
-
- # update infos
- # list of valid actions (indexes in board)
- # next_agent = self.agents[(self.agents.index(self.agent_selection) + 1) % len(self.agents)]
- next_agent = self._agent_selector.next()
- if self.board.check_game_over():
- winner = self.board.check_for_winner()
+ self.board.play_turn(self.agents.index(self.agent_selection), action)
- if winner == -1:
- # tie
+ status = self.board.game_status()
+ if status != TTT_GAME_NOT_OVER:
+ if status == TTT_TIE:
pass
- elif winner == 1:
- # agent 0 won
- self.rewards[self.agents[0]] += 1
- self.rewards[self.agents[1]] -= 1
else:
- # agent 1 won
- self.rewards[self.agents[1]] += 1
- self.rewards[self.agents[0]] -= 1
+ winner = status # either TTT_PLAYER1_WIN or TTT_PLAYER2_WIN
+ loser = winner ^ 1 # 0 -> 1; 1 -> 0
+ self.rewards[self.agents[winner]] += 1
+ self.rewards[self.agents[loser]] -= 1
# once either player wins or there is a draw, game over, both players are done
self.terminations = {i: True for i in self.agents}
+ self._accumulate_rewards()
- # Switch selection to next agents
- self._cumulative_rewards[self.agent_selection] = 0
- self.agent_selection = next_agent
+ self.agent_selection = self._agent_selector.next()
- self._accumulate_rewards()
if self.render_mode == "human":
self.render()
def reset(self, seed=None, options=None):
- # reset environment
- self.board = Board()
+ self.board.reset()
self.agents = self.possible_agents[:]
self.rewards = {i: 0 for i in self.agents}
@@ -244,10 +227,9 @@ def reset(self, seed=None, options=None):
self.infos = {i: {} for i in self.agents}
# selects the first agent
self._agent_selector.reinit(self.agents)
- self._agent_selector.reset()
self.agent_selection = self._agent_selector.reset()
- if self.screen is None:
+ if self.render_mode is not None and self.screen is None:
pygame.init()
if self.render_mode == "human":
@@ -255,7 +237,7 @@ def reset(self, seed=None, options=None):
(self.screen_height, self.screen_height)
)
pygame.display.set_caption("Tic-Tac-Toe")
- else:
+ elif self.render_mode == "rgb_array":
self.screen = pygame.Surface((self.screen_height, self.screen_height))
def close(self):
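A hedged sketch, not part of the patch, of what the reworked `observe()` and `_get_mask()` now produce; the shapes and dtypes follow from the code above, and the freshly reset board is only a convenient illustration:

```python
# Illustrative sketch of the observation structure produced by observe() above.
import numpy as np

from pettingzoo.classic import tictactoe_v3

env = tictactoe_v3.raw_env()
env.reset(seed=0)

obs = env.observe(env.agent_selection)
assert obs["observation"].shape == (3, 3, 2)   # plane 0: own marks, plane 1: opponent's
assert obs["observation"].dtype == np.int8
assert obs["action_mask"].tolist() == [1] * 9  # empty board: every move is legal

# Per the comment in _get_mask, any agent other than the selected one gets an all-zero mask.
other = next(a for a in env.agents if a != env.agent_selection)
assert env.observe(other)["action_mask"].tolist() == [0] * 9
```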
diff --git a/tutorials/SB3/test/test_sb3_action_mask.py b/tutorials/SB3/test/test_sb3_action_mask.py
index de4ee3c07..2be85b1d8 100644
--- a/tutorials/SB3/test/test_sb3_action_mask.py
+++ b/tutorials/SB3/test/test_sb3_action_mask.py
@@ -91,7 +91,7 @@ def test_action_mask_medium(env_fn):
assert (
winrate < 0.75
- ), "Policy should not perform better than 75% winrate" # 30-40% for leduc, 0% for hanabi, 0% for tic-tac-toe
+ ), "Policy should not perform better than 75% winrate" # 30-40% for leduc, 0% for hanabi
# Watch two games (disabled by default)
# eval_action_mask(env_fn, num_games=2, render_mode="human", **env_kwargs)
From 46d92e024c1382a2568fce2dd517c186bd6bb775 Mon Sep 17 00:00:00 2001
From: "HP (Hetav)" <60848863+pandyah5@users.noreply.github.com>
Date: Thu, 20 Jun 2024 18:26:07 -0700
Subject: [PATCH 07/22] Updated documentation of observation format in Simple
World Comm (#1212)
---
pettingzoo/mpe/simple_world_comm/simple_world_comm.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
index 598c0d23e..2601dd76e 100644
--- a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
+++ b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
@@ -30,11 +30,11 @@
In particular, the good agents reward, is -5 for every collision with an adversary, -2 x bound by the `bound` function described in simple_tag, +2 for every collision with a food, and -0.05 x minimum distance to any food. The adversarial agents are rewarded +5 for collisions and -0.1 x minimum
distance to a good agent.
-Good agent observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest]`
+Good agent observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, self_in_forest, other_agent_velocities]`
Normal adversary observations:`[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest, leader_comm]`
-Adversary leader observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, leader_comm]`
+Adversary leader observations: `[self_vel, self_pos, landmark_rel_positions, other_agent_rel_positions, other_agent_velocities, self_in_forest, leader_comm]`
*Note that when the forests prevent an agent from being seen, the observation of that agent's relative position is set to (0,0).*
From 4ecc0e1ce41994d19e89e72806b37f07b79f2356 Mon Sep 17 00:00:00 2001
From: ajmeek <61296971+ajmeek@users.noreply.github.com>
Date: Thu, 20 Jun 2024 21:27:51 -0400
Subject: [PATCH 08/22] Changed dead link to correct link in Tianshou tutorial
(#1214)
---
docs/tutorials/tianshou/index.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/tutorials/tianshou/index.md b/docs/tutorials/tianshou/index.md
index eef3a7d0c..1a879f12c 100644
--- a/docs/tutorials/tianshou/index.md
+++ b/docs/tutorials/tianshou/index.md
@@ -21,7 +21,7 @@ It boasts a large number of algorithms and high quality software engineering sta
## Examples using PettingZoo
-* [Multi-Agent RL](https://tianshou.readthedocs.io/en/master/tutorials/tictactoe.html)
+* [Multi-Agent RL](https://tianshou.org/en/master/01_tutorials/04_tictactoe.html)
## Architecture
From 9f441feb56772e1a08c5998318d66b9989ea75b0 Mon Sep 17 00:00:00 2001
From: Jannik Hinrichs <58370727+Zoraiyo@users.noreply.github.com>
Date: Fri, 21 Jun 2024 03:32:03 +0200
Subject: [PATCH 09/22] Fix: Swapped colors in waterworld description (#1210)
---
pettingzoo/sisl/waterworld/waterworld.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pettingzoo/sisl/waterworld/waterworld.py b/pettingzoo/sisl/waterworld/waterworld.py
index 7684d7206..df1e31549 100644
--- a/pettingzoo/sisl/waterworld/waterworld.py
+++ b/pettingzoo/sisl/waterworld/waterworld.py
@@ -33,7 +33,7 @@
poison respectively. The number of features per sensor is 8 by default with `speed_features` enabled, or 5 if `speed_features` is turned off. Therefore with `speed_features` enabled, the observation shape takes the full form of `(8 × n_sensors) + 2`. Elements of the observation vector take on
values in the range [-1, 1].
-For example, by default there are 5 agents (purple), 5 food targets (red) and 10 poison targets (green). Each agent has 30 range-limited sensors, depicted by the black lines, to detect neighboring entities (food and poison targets) resulting in 242 element vector of computed values about the
+For example, by default there are 5 agents (purple), 5 food targets (green) and 10 poison targets (red). Each agent has 30 range-limited sensors, depicted by the black lines, to detect neighboring entities (food and poison targets) resulting in 242 element vector of computed values about the
environment for the observation space. These values represent the distances and speeds sensed by each sensor on the archea. Sensors that do not sense any objects within their range report 0 for speed and 1 for distance.
This has been fixed from the reference environments to keep items floating off screen and being lost forever.
From 1eef080e59d4e81503d2138d42116edcea5224c4 Mon Sep 17 00:00:00 2001
From: David Ackerman <145808634+dm-ackerman@users.noreply.github.com>
Date: Thu, 20 Jun 2024 21:40:43 -0400
Subject: [PATCH 10/22] TerminateIllegalWrapper fix (#1206)
---
.../utils/wrappers/terminate_illegal.py | 11 ++-
test/wrapper_test.py | 71 ++++++++++++++++++-
2 files changed, 73 insertions(+), 9 deletions(-)
diff --git a/pettingzoo/utils/wrappers/terminate_illegal.py b/pettingzoo/utils/wrappers/terminate_illegal.py
index a49d9a0be..79f95504a 100644
--- a/pettingzoo/utils/wrappers/terminate_illegal.py
+++ b/pettingzoo/utils/wrappers/terminate_illegal.py
@@ -1,4 +1,3 @@
-# pyright reportGeneralTypeIssues=false
from __future__ import annotations
from pettingzoo.utils.env import ActionType, AECEnv, AgentID, ObsType
@@ -20,6 +19,7 @@ def __init__(
self._illegal_value = illegal_reward
self._prev_obs = None
self._prev_info = None
+ self._terminated = False # terminated by an illegal move
def reset(self, seed: int | None = None, options: dict | None = None) -> None:
self._terminated = False
@@ -42,7 +42,6 @@ def step(self, action: ActionType) -> None:
if self._prev_obs is None:
self.observe(self.agent_selection)
if isinstance(self._prev_obs, dict):
- assert self._prev_obs is not None
assert (
"action_mask" in self._prev_obs
), f"`action_mask` not found in dictionary observation: {self._prev_obs}. Action mask must either be in `observation['action_mask']` or `info['action_mask']` to use TerminateIllegalWrapper."
@@ -60,7 +59,7 @@ def step(self, action: ActionType) -> None:
self.terminations[self.agent_selection]
or self.truncations[self.agent_selection]
):
- self._was_dead_step(action) # pyright: ignore[reportGeneralTypeIssues]
+ self.env.unwrapped._was_dead_step(action)
elif (
not self.terminations[self.agent_selection]
and not self.truncations[self.agent_selection]
@@ -70,12 +69,10 @@ def step(self, action: ActionType) -> None:
self.env.unwrapped._cumulative_rewards[self.agent_selection] = 0
self.env.unwrapped.terminations = {d: True for d in self.agents}
self.env.unwrapped.truncations = {d: True for d in self.agents}
- self._prev_obs = None
- self._prev_info = None
self.env.unwrapped.rewards = {d: 0 for d in self.truncations}
self.env.unwrapped.rewards[current_agent] = float(self._illegal_value)
- self._accumulate_rewards()
- self._deads_step_first()
+ self.env.unwrapped._accumulate_rewards()
+ self.env.unwrapped._deads_step_first()
self._terminated = True
else:
super().step(action)
diff --git a/test/wrapper_test.py b/test/wrapper_test.py
index 650fe328b..a03bd81b3 100644
--- a/test/wrapper_test.py
+++ b/test/wrapper_test.py
@@ -3,8 +3,13 @@
import pytest
from pettingzoo.butterfly import pistonball_v6
-from pettingzoo.classic import texas_holdem_no_limit_v6
-from pettingzoo.utils.wrappers import MultiEpisodeEnv, MultiEpisodeParallelEnv
+from pettingzoo.classic import texas_holdem_no_limit_v6, tictactoe_v3
+from pettingzoo.utils.wrappers import (
+ BaseWrapper,
+ MultiEpisodeEnv,
+ MultiEpisodeParallelEnv,
+ TerminateIllegalWrapper,
+)
@pytest.mark.parametrize(("num_episodes"), [1, 2, 3, 4, 5, 6])
@@ -67,3 +72,65 @@ def test_multi_episode_parallel_env_wrapper(num_episodes) -> None:
assert (
steps == num_episodes * 125
), f"Expected to have 125 steps per episode, got {steps / num_episodes}."
+
+
+def _do_game(env: TerminateIllegalWrapper, seed: int) -> None:
+ """Run a single game with reproducible random moves."""
+ assert isinstance(
+ env, TerminateIllegalWrapper
+ ), "test_terminate_illegal must use TerminateIllegalWrapper"
+ env.reset(seed)
+ for agent in env.agents:
+ # make the random moves reproducible
+ env.action_space(agent).seed(seed)
+
+ for agent in env.agent_iter():
+ _, _, termination, truncation, _ = env.last()
+
+ if termination or truncation:
+ env.step(None)
+ else:
+ action = env.action_space(agent).sample()
+ env.step(action)
+
+
+def test_terminate_illegal() -> None:
+ """Test for a problem with terminate illegal wrapper.
+
+    The problem is that env variables, including agent_selection, are set by
+    calls from TerminateIllegalWrapper to env functions. However, because those
+    calls are made on the wrapper object rather than on the env, the values are
+    stored on the wrapper instead of the base env object. When the code later
+    runs, the env updates its own values, but the wrapper keeps returning its
+    stale copies, which shadow them.
+
+ The test here confirms that is fixed.
+ """
+ # not using env() because we need to ensure that the env is
+ # wrapped by TerminateIllegalWrapper
+ raw_env = tictactoe_v3.raw_env()
+ env = TerminateIllegalWrapper(raw_env, illegal_reward=-1)
+
+ _do_game(env, 42)
+ # bug is triggered by a corrupted state after a game is terminated
+ # due to an illegal move. So we need to run the game twice to
+ # see the effect.
+ _do_game(env, 42)
+
+    # collect all the agent_selection values in the wrapper stack
+ unwrapped = env
+ agent_selections = []
+ while unwrapped != env.unwrapped:
+ # the actual value for this wrapper (or None if no value)
+ agent_selections.append(unwrapped.__dict__.get("agent_selection", None))
+ assert isinstance(unwrapped, BaseWrapper)
+ unwrapped = unwrapped.env
+
+ # last one from the actual env
+ agent_selections.append(unwrapped.__dict__.get("agent_selection", None))
+
+ # remove None from agent_selections
+ agent_selections = [x for x in agent_selections if x is not None]
+
+ # all values must be the same, or else the wrapper and env are mismatched
+ assert len(set(agent_selections)) == 1, "agent_selection mismatch"
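For readers unfamiliar with the shadowing behaviour this test guards against, here is a minimal toy sketch (illustrative classes only, not PettingZoo code) of how setting an attribute on a forwarding wrapper hides later updates made on the wrapped object:

```python
class Base:
    def __init__(self):
        self.agent_selection = "player_0"


class Wrapper:
    def __init__(self, env):
        self.env = env

    def __getattr__(self, name):
        # only consulted when `name` is not already set on the wrapper itself
        return getattr(self.env, name)


base = Base()
wrapped = Wrapper(base)
wrapped.agent_selection = "player_1"  # written onto the wrapper, not onto base
base.agent_selection = "player_0"     # the underlying env later updates its own value
print(base.agent_selection)           # "player_0"
print(wrapped.agent_selection)        # "player_1": the wrapper's stale copy shadows base
```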
From 1282a0a1624c0dc92e3b70f1ba65f65c6b265499 Mon Sep 17 00:00:00 2001
From: David Ackerman <145808634+dm-ackerman@users.noreply.github.com>
Date: Thu, 20 Jun 2024 21:49:27 -0400
Subject: [PATCH 11/22] Order enforcing wrapper fix (#1205)
---
pettingzoo/utils/env_logger.py | 14 -----
pettingzoo/utils/wrappers/order_enforcing.py | 60 +++++++-------------
2 files changed, 19 insertions(+), 55 deletions(-)
diff --git a/pettingzoo/utils/env_logger.py b/pettingzoo/utils/env_logger.py
index c5e640e47..bd505e2e3 100644
--- a/pettingzoo/utils/env_logger.py
+++ b/pettingzoo/utils/env_logger.py
@@ -61,20 +61,6 @@ def warn_action_out_of_bound(
f"[WARNING]: Received an action {action} that was outside action space {action_space}. Environment is {backup_policy}"
)
- @staticmethod
- def warn_close_unrendered_env() -> None:
- """Warns: ``[WARNING]: Called close on an unrendered environment.``."""
- EnvLogger._generic_warning(
- "[WARNING]: Called close on an unrendered environment."
- )
-
- @staticmethod
- def warn_close_before_reset() -> None:
- """Warns: ``[WARNING]: reset() needs to be called before close.``."""
- EnvLogger._generic_warning(
- "[WARNING]: reset() needs to be called before close."
- )
-
@staticmethod
def warn_on_illegal_move() -> None:
"""Warns: ``[WARNING]: Illegal move made, game terminating with current player losing.``."""
diff --git a/pettingzoo/utils/wrappers/order_enforcing.py b/pettingzoo/utils/wrappers/order_enforcing.py
index 649c23caa..4a1255682 100644
--- a/pettingzoo/utils/wrappers/order_enforcing.py
+++ b/pettingzoo/utils/wrappers/order_enforcing.py
@@ -19,11 +19,13 @@
class OrderEnforcingWrapper(BaseWrapper[AgentID, ObsType, ActionType]):
"""Checks if function calls or attribute access are in a disallowed order.
- * error on getting rewards, terminations, truncations, infos, agent_selection before reset
- * error on calling step, observe before reset
- * error on iterating without stepping or resetting environment.
- * warn on calling close before render or reset
- * warn on calling step after environment is terminated or truncated
+ The following are raised:
+ * AttributeError if any of the following are accessed before reset():
+ rewards, terminations, truncations, infos, agent_selection,
+ num_agents, agents.
+ * An error if any of the following are called before reset:
+ render(), step(), observe(), state(), agent_iter()
+ * A warning if step() is called when there are no agents remaining.
"""
def __init__(self, env: AECEnv[AgentID, ObsType, ActionType]):
@@ -31,37 +33,12 @@ def __init__(self, env: AECEnv[AgentID, ObsType, ActionType]):
env, AECEnv
), "OrderEnforcingWrapper is only compatible with AEC environments"
self._has_reset = False
- self._has_rendered = False
self._has_updated = False
super().__init__(env)
def __getattr__(self, value: str) -> Any:
- """Raises an error message when data is gotten from the env.
-
- Should only be gotten after reset
- """
- if value == "unwrapped":
- return self.env.unwrapped
- elif value == "render_mode" and hasattr(self.env, "render_mode"):
- return self.env.render_mode # pyright: ignore[reportGeneralTypeIssues]
- elif value == "possible_agents":
- try:
- return self.env.possible_agents
- except AttributeError:
- EnvLogger.error_possible_agents_attribute_missing("possible_agents")
- elif value == "observation_spaces":
- raise AttributeError(
- "The base environment does not have an possible_agents attribute. Use the environments `observation_space` method instead"
- )
- elif value == "action_spaces":
- raise AttributeError(
- "The base environment does not have an possible_agents attribute. Use the environments `action_space` method instead"
- )
- elif value == "agent_order":
- raise AttributeError(
- "agent_order has been removed from the API. Please consider using agent_iter instead."
- )
- elif (
+ """Raises an error if certain data is accessed before reset."""
+ if (
value
in {
"rewards",
@@ -75,13 +52,11 @@ def __getattr__(self, value: str) -> Any:
and not self._has_reset
):
raise AttributeError(f"{value} cannot be accessed before reset")
- else:
- return super().__getattr__(value)
+ return super().__getattr__(value)
def render(self) -> None | np.ndarray | str | list:
if not self._has_reset:
EnvLogger.error_render_before_reset()
- self._has_rendered = True
return super().render()
def step(self, action: ActionType) -> None:
@@ -90,7 +65,6 @@ def step(self, action: ActionType) -> None:
elif not self.agents:
self._has_updated = True
EnvLogger.warn_step_after_terminated_truncated()
- return None
else:
self._has_updated = True
super().step(action)
@@ -124,8 +98,7 @@ def __str__(self) -> str:
if self.__class__ is OrderEnforcingWrapper
else f"{type(self).__name__}<{str(self.env)}>"
)
- else:
- return repr(self)
+ return repr(self)
class AECOrderEnforcingIterable(AECIterable[AgentID, ObsType, ActionType]):
@@ -134,11 +107,16 @@ def __iter__(self) -> AECOrderEnforcingIterator[AgentID, ObsType, ActionType]:
class AECOrderEnforcingIterator(AECIterator[AgentID, ObsType, ActionType]):
+ def __init__(
+ self, env: OrderEnforcingWrapper[AgentID, ObsType, ActionType], max_iter: int
+ ):
+ assert isinstance(
+ env, OrderEnforcingWrapper
+ ), "env must be wrapped by OrderEnforcingWrapper"
+ super().__init__(env, max_iter)
+
def __next__(self) -> AgentID:
agent = super().__next__()
- assert hasattr(
- self.env, "_has_updated"
- ), "env must be wrapped by OrderEnforcingWrapper"
assert (
self.env._has_updated # pyright: ignore[reportGeneralTypeIssues]
), "need to call step() or reset() in a loop over `agent_iter`"
From 9434bd7113b883aa9c223f3c790e4146b3933627 Mon Sep 17 00:00:00 2001
From: "HP (Hetav)" <60848863+pandyah5@users.noreply.github.com>
Date: Tue, 25 Jun 2024 18:30:40 -0700
Subject: [PATCH 12/22] Added radius rescaling to simple env (#1213)
---
pettingzoo/mpe/_mpe_utils/simple_env.py | 26 ++++++++++++++-----
pettingzoo/mpe/simple/simple.py | 13 ++++++++--
.../mpe/simple_adversary/simple_adversary.py | 14 ++++++++--
pettingzoo/mpe/simple_crypto/simple_crypto.py | 13 ++++++++--
pettingzoo/mpe/simple_push/simple_push.py | 14 ++++++++--
.../mpe/simple_reference/simple_reference.py | 12 +++++++--
.../simple_speaker_listener.py | 13 ++++++++--
pettingzoo/mpe/simple_spread/simple_spread.py | 6 ++++-
pettingzoo/mpe/simple_tag/simple_tag.py | 6 ++++-
.../simple_world_comm/simple_world_comm.py | 6 ++++-
10 files changed, 102 insertions(+), 21 deletions(-)
diff --git a/pettingzoo/mpe/_mpe_utils/simple_env.py b/pettingzoo/mpe/_mpe_utils/simple_env.py
index af95b64d4..6cc9bb3d2 100644
--- a/pettingzoo/mpe/_mpe_utils/simple_env.py
+++ b/pettingzoo/mpe/_mpe_utils/simple_env.py
@@ -42,6 +42,7 @@ def __init__(
render_mode=None,
continuous_actions=False,
local_ratio=None,
+ dynamic_rescaling=False,
):
super().__init__()
@@ -66,6 +67,7 @@ def __init__(
self.world = world
self.continuous_actions = continuous_actions
self.local_ratio = local_ratio
+ self.dynamic_rescaling = dynamic_rescaling
self.scenario.reset_world(self.world, self.np_random)
@@ -116,6 +118,11 @@ def __init__(
dtype=np.float32,
)
+ # Get the original cam_range
+ # This will be used to scale the rendering
+ all_poses = [entity.state.p_pos for entity in self.world.entities]
+ self.original_cam_range = np.max(np.abs(np.array(all_poses)))
+
self.steps = 0
self.current_actions = [None] * self.num_agents
@@ -295,6 +302,10 @@ def draw(self):
all_poses = [entity.state.p_pos for entity in self.world.entities]
cam_range = np.max(np.abs(np.array(all_poses)))
+        # The scaling factor is used for dynamic rescaling of the rendering, i.e. a zoom in/zoom out effect
+        # The 0.9 factor keeps entities from appearing too far out-of-bounds
+ scaling_factor = 0.9 * self.original_cam_range / cam_range
+
# update geometry and text positions
text_line = 0
for e, entity in enumerate(self.world.entities):
@@ -309,12 +320,15 @@ def draw(self):
y = (y / cam_range) * self.height // 2 * 0.9
x += self.width // 2
y += self.height // 2
- pygame.draw.circle(
- self.screen, entity.color * 200, (x, y), entity.size * 350
- ) # 350 is an arbitrary scale factor to get pygame to render similar sizes as pyglet
- pygame.draw.circle(
- self.screen, (0, 0, 0), (x, y), entity.size * 350, 1
- ) # borders
+
+ # 350 is an arbitrary scale factor to get pygame to render similar sizes as pyglet
+ if self.dynamic_rescaling:
+ radius = entity.size * 350 * scaling_factor
+ else:
+ radius = entity.size * 350
+
+ pygame.draw.circle(self.screen, entity.color * 200, (x, y), radius)
+ pygame.draw.circle(self.screen, (0, 0, 0), (x, y), radius, 1) # borders
assert (
0 < x < self.width and 0 < y < self.height
), f"Coordinates {(x, y)} are out of bounds."
diff --git a/pettingzoo/mpe/simple/simple.py b/pettingzoo/mpe/simple/simple.py
index b9d6f255a..7431c4fb1 100644
--- a/pettingzoo/mpe/simple/simple.py
+++ b/pettingzoo/mpe/simple/simple.py
@@ -31,7 +31,7 @@
### Arguments
``` python
-simple_v3.env(max_cycles=25, continuous_actions=False)
+simple_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -40,6 +40,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -52,7 +54,13 @@
class raw_env(SimpleEnv, EzPickle):
- def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
+ def __init__(
+ self,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
+ ):
EzPickle.__init__(
self,
max_cycles=max_cycles,
@@ -68,6 +76,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_v3"
diff --git a/pettingzoo/mpe/simple_adversary/simple_adversary.py b/pettingzoo/mpe/simple_adversary/simple_adversary.py
index 674790c38..cf7a38499 100644
--- a/pettingzoo/mpe/simple_adversary/simple_adversary.py
+++ b/pettingzoo/mpe/simple_adversary/simple_adversary.py
@@ -39,7 +39,7 @@
### Arguments
``` python
-simple_adversary_v3.env(N=2, max_cycles=25, continuous_actions=False)
+simple_adversary_v3.env(N=2, max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -50,6 +50,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -62,7 +64,14 @@
class raw_env(SimpleEnv, EzPickle):
- def __init__(self, N=2, max_cycles=25, continuous_actions=False, render_mode=None):
+ def __init__(
+ self,
+ N=2,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
+ ):
EzPickle.__init__(
self,
N=N,
@@ -79,6 +88,7 @@ def __init__(self, N=2, max_cycles=25, continuous_actions=False, render_mode=Non
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_adversary_v3"
diff --git a/pettingzoo/mpe/simple_crypto/simple_crypto.py b/pettingzoo/mpe/simple_crypto/simple_crypto.py
index 66a8d2ad1..f74b5f0d1 100644
--- a/pettingzoo/mpe/simple_crypto/simple_crypto.py
+++ b/pettingzoo/mpe/simple_crypto/simple_crypto.py
@@ -45,7 +45,7 @@
### Arguments
``` python
-simple_crypto_v3.env(max_cycles=25, continuous_actions=False)
+simple_crypto_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -54,6 +54,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -73,7 +75,13 @@
class raw_env(SimpleEnv, EzPickle):
- def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
+ def __init__(
+ self,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
+ ):
EzPickle.__init__(
self,
max_cycles=max_cycles,
@@ -89,6 +97,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_crypto_v3"
diff --git a/pettingzoo/mpe/simple_push/simple_push.py b/pettingzoo/mpe/simple_push/simple_push.py
index 1a11a98d8..46b352803 100644
--- a/pettingzoo/mpe/simple_push/simple_push.py
+++ b/pettingzoo/mpe/simple_push/simple_push.py
@@ -38,13 +38,16 @@
### Arguments
``` python
-simple_push_v3.env(max_cycles=25, continuous_actions=False)
+simple_push_v3.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
`max_cycles`: number of frames (a step for each agent) until game terminates
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -57,7 +60,13 @@
class raw_env(SimpleEnv, EzPickle):
- def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
+ def __init__(
+ self,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
+ ):
EzPickle.__init__(
self,
max_cycles=max_cycles,
@@ -73,6 +82,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_push_v3"
diff --git a/pettingzoo/mpe/simple_reference/simple_reference.py b/pettingzoo/mpe/simple_reference/simple_reference.py
index a934b9014..d058e7d21 100644
--- a/pettingzoo/mpe/simple_reference/simple_reference.py
+++ b/pettingzoo/mpe/simple_reference/simple_reference.py
@@ -40,7 +40,7 @@
``` python
-simple_reference_v3.env(local_ratio=0.5, max_cycles=25, continuous_actions=False)
+simple_reference_v3.env(local_ratio=0.5, max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -51,6 +51,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -64,7 +66,12 @@
class raw_env(SimpleEnv, EzPickle):
def __init__(
- self, local_ratio=0.5, max_cycles=25, continuous_actions=False, render_mode=None
+ self,
+ local_ratio=0.5,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
):
EzPickle.__init__(
self,
@@ -86,6 +93,7 @@ def __init__(
max_cycles=max_cycles,
continuous_actions=continuous_actions,
local_ratio=local_ratio,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_reference_v3"
diff --git a/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py b/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py
index fbfbe9c85..4fc09e6a3 100644
--- a/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py
+++ b/pettingzoo/mpe/simple_speaker_listener/simple_speaker_listener.py
@@ -37,7 +37,7 @@
### Arguments
``` python
-simple_speaker_listener_v4.env(max_cycles=25, continuous_actions=False)
+simple_speaker_listener_v4.env(max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -46,6 +46,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -58,7 +60,13 @@
class raw_env(SimpleEnv, EzPickle):
- def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
+ def __init__(
+ self,
+ max_cycles=25,
+ continuous_actions=False,
+ render_mode=None,
+ dynamic_rescaling=False,
+ ):
EzPickle.__init__(
self,
max_cycles=max_cycles,
@@ -74,6 +82,7 @@ def __init__(self, max_cycles=25, continuous_actions=False, render_mode=None):
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_speaker_listener_v4"
diff --git a/pettingzoo/mpe/simple_spread/simple_spread.py b/pettingzoo/mpe/simple_spread/simple_spread.py
index 83e79e53e..4313780ae 100644
--- a/pettingzoo/mpe/simple_spread/simple_spread.py
+++ b/pettingzoo/mpe/simple_spread/simple_spread.py
@@ -36,7 +36,7 @@
### Arguments
``` python
-simple_spread_v3.env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False)
+simple_spread_v3.env(N=3, local_ratio=0.5, max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -49,6 +49,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -68,6 +70,7 @@ def __init__(
max_cycles=25,
continuous_actions=False,
render_mode=None,
+ dynamic_rescaling=False,
):
EzPickle.__init__(
self,
@@ -90,6 +93,7 @@ def __init__(
max_cycles=max_cycles,
continuous_actions=continuous_actions,
local_ratio=local_ratio,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_spread_v3"
diff --git a/pettingzoo/mpe/simple_tag/simple_tag.py b/pettingzoo/mpe/simple_tag/simple_tag.py
index 7727eb425..1f6c3b48f 100644
--- a/pettingzoo/mpe/simple_tag/simple_tag.py
+++ b/pettingzoo/mpe/simple_tag/simple_tag.py
@@ -45,7 +45,7 @@ def bound(x):
### Arguments
``` python
-simple_tag_v3.env(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=25, continuous_actions=False)
+simple_tag_v3.env(num_good=1, num_adversaries=3, num_obstacles=2, max_cycles=25, continuous_actions=False, dynamic_rescaling=False)
```
@@ -60,6 +60,8 @@ def bound(x):
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -80,6 +82,7 @@ def __init__(
max_cycles=25,
continuous_actions=False,
render_mode=None,
+ dynamic_rescaling=False,
):
EzPickle.__init__(
self,
@@ -99,6 +102,7 @@ def __init__(
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_tag_v3"
diff --git a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
index 2601dd76e..0f2932743 100644
--- a/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
+++ b/pettingzoo/mpe/simple_world_comm/simple_world_comm.py
@@ -52,7 +52,7 @@
``` python
simple_world_comm_v3.env(num_good=2, num_adversaries=4, num_obstacles=1,
- num_food=2, max_cycles=25, num_forests=2, continuous_actions=False)
+ num_food=2, max_cycles=25, num_forests=2, continuous_actions=False, dynamic_rescaling=False)
```
@@ -71,6 +71,8 @@
`continuous_actions`: Whether agent action spaces are discrete(default) or continuous
+`dynamic_rescaling`: Whether to rescale the size of agents and landmarks based on the screen size
+
"""
import numpy as np
@@ -93,6 +95,7 @@ def __init__(
num_forests=2,
continuous_actions=False,
render_mode=None,
+ dynamic_rescaling=False,
):
EzPickle.__init__(
self,
@@ -116,6 +119,7 @@ def __init__(
render_mode=render_mode,
max_cycles=max_cycles,
continuous_actions=continuous_actions,
+ dynamic_rescaling=dynamic_rescaling,
)
self.metadata["name"] = "simple_world_comm_v3"
From 4d75d86d39e4b8f01d224d3875896087d1aae217 Mon Sep 17 00:00:00 2001
From: Xihuai Wang
Date: Tue, 2 Jul 2024 14:59:23 +0800
Subject: [PATCH 13/22] Add an easy-to-install gfootball environment, SMAC and
SMACv2 with pettingzoo apis in `third_party_envs` (#1217)
---
docs/environments/third_party_envs.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/docs/environments/third_party_envs.md b/docs/environments/third_party_envs.md
index 928904be9..167f14de9 100644
--- a/docs/environments/third_party_envs.md
+++ b/docs/environments/third_party_envs.md
@@ -12,6 +12,18 @@ lastpage:
## Environments using the latest versions of PettingZoo
*Due to a very recent major release of PettingZoo, there are currently few contributed third-party environments. If you'd like to contribute one, please reach out on [Discord](https://discord.gg/nHg2JRN489).*
+### [gfootball-gymnasium-pettingzoo](https://github.com/xihuai18/gfootball-gymnasium-pettingzoo)
+[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.24.3-blue)]()
+[![GitHub stars](https://img.shields.io/github/stars/xihuai18/gfootball-gymnasium-pettingzoo)]()
+
+Google Research Football ([GRF](https://github.com/google-research/football)) with Gymnasium and PettingZoo Compatibility.
+
+### [SMAC and SMACv2 with latest PettingZoo APIs](https://github.com/xihuai18/SMAC-PettingZoo)
+[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.24.3-blue)]()
+[![GitHub stars](https://img.shields.io/github/stars/xihuai18/SMAC-PettingZoo)]()
+
+[SMAC](https://github.com/oxwhirl/smac) and [SMACv2](https://github.com/oxwhirl/smacv2) with the latest PettingZoo Parallel APIs.
+
### [Sumo-RL](https://github.com/LucasAlegre/sumo-rl)
[![PettingZoo version dependency](https://img.shields.io/badge/PettingZoo-v1.22.2-blue)]()
From ace38942ba8815388353ddeb5e4f18a5f44bef61 Mon Sep 17 00:00:00 2001
From: Florin Pop
Date: Tue, 2 Jul 2024 08:59:47 +0200
Subject: [PATCH 14/22] Fix memory leak in waterworld_v4 by clearing old
handlers before adding new ones (#1218)
---
pettingzoo/sisl/waterworld/waterworld_base.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pettingzoo/sisl/waterworld/waterworld_base.py b/pettingzoo/sisl/waterworld/waterworld_base.py
index b6705b76b..7c82da4e4 100644
--- a/pettingzoo/sisl/waterworld/waterworld_base.py
+++ b/pettingzoo/sisl/waterworld/waterworld_base.py
@@ -313,6 +313,8 @@ def draw(self):
def add_handlers(self):
# Collision handlers for pursuers v.s. evaders & poisons
+ self.handlers = []
+
for pursuer in self.pursuers:
for obj in self.evaders:
self.handlers.append(
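To make the leak concrete, here is a minimal toy sketch (not the waterworld code) of what happens when handlers are appended on every reset without first clearing the list:

```python
class ToyEnv:
    def __init__(self):
        self.handlers = []

    def add_handlers(self, n_pairs):
        # self.handlers = []  # <- the one-line fix above: drop stale handlers first
        for _ in range(n_pairs):
            self.handlers.append(object())  # stand-in for a pymunk collision handler


env = ToyEnv()
for _ in range(3):          # e.g. three environment resets
    env.add_handlers(10)
print(len(env.handlers))    # grows to 30 without the fix; stays at 10 with it
```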
From 6f6450000dff17f31efa81aef73a417e853b7be4 Mon Sep 17 00:00:00 2001
From: Omar Younis <42100908+younik@users.noreply.github.com>
Date: Sat, 3 Aug 2024 19:21:42 +0200
Subject: [PATCH 15/22] Add Python 3.12 support (#1226)
---
.github/workflows/build-publish.yml | 3 ++
.github/workflows/linux-test.yml | 2 +-
.github/workflows/macos-test.yml | 2 +-
docs/code_examples/aec_rps_usage.py | 2 +-
docs/code_examples/parallel_rps_usage.py | 2 +-
pyproject.toml | 41 ++++++++++++------------
tutorials/Tianshou/requirements.txt | 5 +--
7 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/.github/workflows/build-publish.yml b/.github/workflows/build-publish.yml
index 88079fe4e..a4b69870a 100644
--- a/.github/workflows/build-publish.yml
+++ b/.github/workflows/build-publish.yml
@@ -31,6 +31,9 @@ jobs:
- os: ubuntu-latest
python: 311
platform: manylinux_x86_64
+ - os: ubuntu-latest
+ python: 312
+ platform: manylinux_x86_64
steps:
- uses: actions/checkout@v4
diff --git a/.github/workflows/linux-test.yml b/.github/workflows/linux-test.yml
index 7a7139b6b..536cd2e72 100644
--- a/.github/workflows/linux-test.yml
+++ b/.github/workflows/linux-test.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ['3.8', '3.9', '3.10', '3.11']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 82e7a3d15..83d91809a 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -15,7 +15,7 @@ jobs:
matrix:
# Big Sur, Monterey
os: [macos-11, macos-12]
- python-version: ['3.8', '3.9', '3.10', '3.11']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
diff --git a/docs/code_examples/aec_rps_usage.py b/docs/code_examples/aec_rps_usage.py
index 71edc4e73..da7d2111d 100644
--- a/docs/code_examples/aec_rps_usage.py
+++ b/docs/code_examples/aec_rps_usage.py
@@ -1,4 +1,4 @@
-import aec_rps
+from . import aec_rps
env = aec_rps.env(render_mode="human")
env.reset(seed=42)
diff --git a/docs/code_examples/parallel_rps_usage.py b/docs/code_examples/parallel_rps_usage.py
index 38949eb78..a75aa153d 100644
--- a/docs/code_examples/parallel_rps_usage.py
+++ b/docs/code_examples/parallel_rps_usage.py
@@ -1,4 +1,4 @@
-import parallel_rps
+from . import parallel_rps
env = parallel_rps.parallel_env(render_mode="human")
observations, infos = env.reset()
diff --git a/pyproject.toml b/pyproject.toml
index c0160ab17..73c99ea4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
'Intended Audience :: Science/Research',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
]
@@ -31,34 +32,34 @@ dynamic = ["version"]
[project.optional-dependencies]
# Update dependencies in `all` if any are added or removed
-atari = ["multi_agent_ale_py==0.1.11", "pygame==2.3.0"]
+atari = ["multi_agent_ale_py>=0.1.11", "pygame>=2.3.0"]
classic = [
- "chess==1.9.4",
- "rlcard==1.0.5",
- "pygame==2.3.0",
+ "chess>=1.9.4",
+ "rlcard>=1.0.5",
+ "pygame>=2.3.0",
"shimmy[openspiel]>=1.2.0"
]
-butterfly = ["pygame==2.3.0", "pymunk==6.2.0"]
-mpe = ["pygame==2.3.0"]
-sisl = ["pygame==2.3.0", "pymunk==6.2.0", "box2d-py==2.3.5", "scipy>=1.4.1"]
+butterfly = ["pygame>=2.3.0", "pymunk>=6.2.0"]
+mpe = ["pygame>=2.3.0"]
+sisl = ["pygame>=2.3.0", "pymunk>=6.2.0", "box2d-py>=2.3.5", "scipy>=1.4.1"]
other = ["pillow>=8.0.1"]
testing = [
- "pynput==1.7.6",
- "pytest==8.0.0",
- "AutoROM==0.6.1",
- "pytest-cov==4.1.0",
- "pytest-xdist==3.5.0",
- "pre-commit==3.5.0",
- "pytest-markdown-docs==0.5.0"
+ "pynput>=1.7.6",
+ "pytest>=8.0.0",
+ "AutoROM>=0.6.1",
+ "pytest-cov>=4.1.0",
+ "pytest-xdist>=3.5.0",
+ "pre-commit>=3.5.0",
+ "pytest-markdown-docs>=0.5.0"
]
all = [
- "multi_agent_ale_py==0.1.11",
- "pygame==2.3.0",
- "chess==1.9.4",
- "rlcard==1.0.5",
+ "multi_agent_ale_py>=0.1.11",
+ "pygame>=2.3.0",
+ "chess>=1.9.4",
+ "rlcard>=1.0.5",
"shimmy[openspiel]>=1.2.0",
- "pymunk==6.2.0",
- "box2d-py==2.3.5",
+ "pymunk>=6.2.0",
+ "box2d-py>=2.3.5",
"scipy>=1.4.1",
"pillow>=8.0.1",
]
diff --git a/tutorials/Tianshou/requirements.txt b/tutorials/Tianshou/requirements.txt
index b7b8d4a47..b92064488 100644
--- a/tutorials/Tianshou/requirements.txt
+++ b/tutorials/Tianshou/requirements.txt
@@ -1,3 +1,4 @@
-pettingzoo[classic]==1.23.0
-packaging==21.3
+numpy<2.0.0
+pettingzoo[classic]>=1.23.0
+packaging>=21.3
tianshou==0.5.0
From cb566189c21ce9183a7d06cb411c5892715d07df Mon Sep 17 00:00:00 2001
From: Mark Towers
Date: Mon, 12 Aug 2024 17:03:10 +0100
Subject: [PATCH 16/22] Bump version to 1.24.4
---
pettingzoo/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pettingzoo/__init__.py b/pettingzoo/__init__.py
index bef7e58da..4000bdbdc 100644
--- a/pettingzoo/__init__.py
+++ b/pettingzoo/__init__.py
@@ -12,7 +12,7 @@
os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "hide"
-__version__ = "1.24.3"
+__version__ = "1.24.4"
try:
import sys
From fddc36909d19497cb054fbf560618eba6825c178 Mon Sep 17 00:00:00 2001
From: mikepratt1
Date: Fri, 25 Oct 2024 14:39:43 +0100
Subject: [PATCH 17/22] update agilerl tutorials
---
docs/tutorials/agilerl/DQN.md | 242 +++++++--------
docs/tutorials/agilerl/MADDPG.md | 2 +-
tutorials/AgileRL/agilerl_dqn_curriculum.py | 269 ++++++++--------
tutorials/AgileRL/agilerl_maddpg.py | 327 ++++++++++----------
tutorials/AgileRL/agilerl_matd3.py | 220 +++++++------
tutorials/AgileRL/render_agilerl_dqn.py | 37 +--
tutorials/AgileRL/render_agilerl_maddpg.py | 36 +--
tutorials/AgileRL/render_agilerl_matd3.py | 35 +--
tutorials/AgileRL/requirements.txt | 3 +-
9 files changed, 585 insertions(+), 586 deletions(-)
diff --git a/docs/tutorials/agilerl/DQN.md b/docs/tutorials/agilerl/DQN.md
index 6a7cc3731..9cd9cb30a 100644
--- a/docs/tutorials/agilerl/DQN.md
+++ b/docs/tutorials/agilerl/DQN.md
@@ -62,7 +62,7 @@ Importing the following packages, functions and classes will enable us to run th
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
- from agilerl.utils.utils import initialPopulation
+ from agilerl.utils.utils import create_population
from tqdm import tqdm, trange
from pettingzoo.classic import connect_four_v3
@@ -167,27 +167,23 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as
while not (done or truncation):
# Player 0's turn
p0_action_mask = observation["action_mask"]
- p0_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
- p0_state = np.expand_dims(p0_state, 0)
+ p0_state, p0_state_flipped = transform_and_flip(observation, player = 0)
if opponent_first:
p0_action = self.env.action_space("player_0").sample(p0_action_mask)
else:
if self.lesson["warm_up_opponent"] == "random":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_action_mask, p1_action, self.lesson["block_vert_coef"]
)
else:
- p0_action = opponent.getAction(player=0)
+ p0_action = opponent.get_action(player=0)
self.step(p0_action) # Act in environment
observation, env_reward, done, truncation, _ = self.last()
- p0_next_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0)
- p0_next_state = np.expand_dims(p0_next_state, 0)
+ p0_next_state, p0_next_state_flipped = transform_and_flip(observation, player = 0)
if done or truncation:
reward = self.reward(done=True, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(p0_state, p1_state, p0_state_flipped, p1_state_flipped)
),
@@ -211,7 +207,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as
else: # Play continues
if p1_state is not None:
reward = self.reward(done=False, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p1_state, p1_state_flipped)),
[p1_action, 6 - p1_action],
[reward, reward],
@@ -221,31 +217,25 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as
# Player 1's turn
p1_action_mask = observation["action_mask"]
- p1_state = np.moveaxis(observation["observation"], [-1], [-3])
- p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
- p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
- p1_state = np.expand_dims(p1_state, 0)
+ p1_state, p1_state_flipped = transform_and_flip(observation, player = 1)
if not opponent_first:
p1_action = self.env.action_space("player_1").sample(
p1_action_mask
)
else:
if self.lesson["warm_up_opponent"] == "random":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_action_mask, p0_action, LESSON["block_vert_coef"]
)
else:
- p1_action = opponent.getAction(player=1)
+ p1_action = opponent.get_action(player=1)
self.step(p1_action) # Act in environment
observation, env_reward, done, truncation, _ = self.last()
- p1_next_state = np.moveaxis(observation["observation"], [-1], [-3])
- p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
- p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0)
- p1_next_state = np.expand_dims(p1_next_state, 0)
+ p1_next_state, p1_next_state_flipped = transform_and_flip(observation, player = 1)
if done or truncation:
reward = self.reward(done=True, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(p0_state, p1_state, p0_state_flipped, p1_state_flipped)
),
@@ -269,7 +259,7 @@ To implement our curriculum, we create a ```CurriculumEnv``` class that acts as
else: # Play continues
reward = self.reward(done=False, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p0_state, p0_state_flipped)),
[p0_action, 6 - p0_action],
[reward, reward],
@@ -431,11 +421,11 @@ When defining the different lessons in our curriculum, we can increase the diffi
self.env = env.env
self.difficulty = difficulty
if self.difficulty == "random":
- self.getAction = self.random_opponent
+ self.get_action = self.random_opponent
elif self.difficulty == "weak":
- self.getAction = self.weak_rule_based_opponent
+ self.get_action = self.weak_rule_based_opponent
else:
- self.getAction = self.strong_rule_based_opponent
+ self.get_action = self.strong_rule_based_opponent
self.num_cols = 7
self.num_rows = 6
self.length = 4
@@ -612,10 +602,10 @@ Before we go any further in this tutorial, it would be helpful to define and set
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [64, 64], # Actor hidden size
- "c_size": [128], # CNN channel size
- "k_size": [4], # CNN kernel size
- "s_size": [1], # CNN stride size
+ "hidden_size": [64, 64], # Actor hidden size
+ "channel_size": [128], # CNN channel size
+ "kernel_size": [4], # CNN kernel size
+ "stride_size": [1], # CNN stride size
"normalize": False, # Normalize image from range [0,255] to [0,1]
}
@@ -640,7 +630,6 @@ Before we go any further in this tutorial, it would be helpful to define and set
"NUM_ATOMS": 51, # Unit number of support
"V_MIN": 0.0, # Minimum value of support
"V_MAX": 200.0, # Maximum value of support
- "WANDB": False, # Use Weights and Biases tracking
}
# Define the connect four environment
@@ -667,7 +656,7 @@ Before we go any further in this tutorial, it would be helpful to define and set
action_dim = action_dim[0]
# Create a population ready for evolutionary hyper-parameter optimisation
- pop = initialPopulation(
+ pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
@@ -681,7 +670,6 @@ Before we go any further in this tutorial, it would be helpful to define and set
# Configure the replay buffer
field_names = ["state", "action", "reward", "next_state", "done"]
memory = ReplayBuffer(
- action_dim=action_dim, # Number of agent actions
memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size
field_names=field_names, # Field names to store in memory
device=device,
@@ -692,8 +680,8 @@ Before we go any further in this tutorial, it would be helpful to define and set
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
- evo_step=1,
- ) # Evaluate using last N fitness scores
+ eval_loop=1, # Evaluate using last N fitness scores
+ )
# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -733,7 +721,6 @@ Before we go any further in this tutorial, it would be helpful to define and set
eps_end = 0.1 # Final epsilon value
eps_decay = 0.9998 # Epsilon decays
opp_update_counter = 0
- wb = INIT_HP["WANDB"]
```
@@ -745,6 +732,7 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran
```python
# Perform buffer and agent warmups if desired
if LESSON["buffer_warm_up"]:
warm_up_opponent = Opponent(env, difficulty=LESSON["warm_up_opponent"])
memory = env.fill_replay_buffer(
@@ -763,6 +751,33 @@ As part of the curriculum, we may also choose to fill the replay buffer with ran
```
+The observation space of Connect Four is (6, 7, 2), where the first two dimensions represent the board and the third dimension encodes the player. Since PyTorch convolutional layers expect channels-first input, we need to move the player dimension to the front. We also swap the two planes when the agent acts as player 1, so that it always sees the board from its own perspective, and we store a horizontally flipped copy of each state as additional training data. We can define a function to do this as follows:
+
+
+ Transform and Flip
+
+ ```python
+ def transform_and_flip(observation, player):
+ """Transforms and flips observation for input to agent's neural network.
+
+ :param observation: Observation to preprocess
+ :type observation: dict[str, np.ndarray]
+ :param player: Player, 0 or 1
+ :type player: int
+ """
+ state = observation["observation"]
+ # Pre-process dimensions for PyTorch (N, C, H, W)
+ state = np.moveaxis(state, [-1], [-3])
+ if player == 1:
+ # Swap pieces so that the agent always sees the board from the same perspective
+ state[[0, 1], :, :] = state[[1, 0], :, :]
+ state_flipped = np.expand_dims(np.flip(state, 2), 0)
+ state = np.expand_dims(state, 0)
+ return state, state_flipped
+ ```
+
+
+
### Self-play
In this tutorial, we use self-play as the final lesson in our curriculum. By iteratively improving our agent and making it learn to win against itself, we can allow it to discover new strategies and achieve higher performance. The weights of our pretrained agent from an earlier lesson can be loaded to the population as follows:
@@ -774,7 +789,7 @@ In this tutorial, we use self-play as the final lesson in our curriculum. By ite
if LESSON["pretrained_path"] is not None:
for agent in pop:
# Load pretrained checkpoint
- agent.loadCheckpoint(LESSON["pretrained_path"])
+ agent.load_checkpoint(LESSON["pretrained_path"])
# Reinit optimizer for new task
agent.lr = INIT_HP["LR"]
agent.optimizer = torch.optim.Adam(
@@ -824,24 +839,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
```python
if max_episodes > 0:
- if wb:
- wandb.init(
- # set the wandb project where this run will be logged
- project="AgileRL",
- name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format(
- "connect_four_v3",
- INIT_HP["ALGO"],
- LESSON["opponent"],
- datetime.now().strftime("%m%d%Y%H%M%S"),
- ),
- # track hyperparameters and run metadata
- config={
- "algo": "Evo HPO Rainbow DQN",
- "env": "connect_four_v3",
- "INIT_HP": INIT_HP,
- "lesson": LESSON,
- },
- )
+ wandb.init(
+ # set the wandb project where this run will be logged
+ project="AgileRL",
+ name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format(
+ "connect_four_v3",
+ INIT_HP["ALGO"],
+ LESSON["opponent"],
+ datetime.now().strftime("%m%d%Y%H%M%S"),
+ ),
+ # track hyperparameters and run metadata
+ config={
+ "algo": "Evo HPO Rainbow DQN",
+ "env": "connect_four_v3",
+ "INIT_HP": INIT_HP,
+ "lesson": LESSON,
+ },
+ )
total_steps = 0
total_episodes = 0
@@ -854,7 +868,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
for agent in pop: # Loop through population
for episode in range(episodes_per_epoch):
env.reset() # Reset environment at start of episode
- observation, env_reward, done, truncation, _ = env.last()
+ observation, cumulative_reward, done, truncation, _ = env.last()
(
p1_state,
@@ -883,23 +897,21 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
for idx_step in range(max_steps):
# Player 0"s turn
p0_action_mask = observation["action_mask"]
- p0_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
- p0_state = np.expand_dims(p0_state, 0)
+ p0_state, p0_state_flipped = transform_and_flip(observation, player = 0)
if opponent_first:
if LESSON["opponent"] == "self":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_state, 0, p0_action_mask
)[0]
elif LESSON["opponent"] == "random":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_action_mask, p1_action, LESSON["block_vert_coef"]
)
else:
- p0_action = opponent.getAction(player=0)
+ p0_action = opponent.get_action(player=0)
else:
- p0_action = agent.getAction(
+ p0_action = agent.get_action(
p0_state, epsilon, p0_action_mask
)[
0
@@ -907,23 +919,18 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
train_actions_hist[p0_action] += 1
env.step(p0_action) # Act in environment
- observation, env_reward, done, truncation, _ = env.last()
- p0_next_state = np.moveaxis(
- observation["observation"], [-1], [-3]
+ observation, cumulative_reward, done, truncation, _ = env.last()
+ p0_next_state, p0_next_state_flipped = transform_and_flip(
+ observation, player = 0
)
- p0_next_state_flipped = np.expand_dims(
- np.flip(p0_next_state, 2), 0
- )
- p0_next_state = np.expand_dims(p0_next_state, 0)
-
if not opponent_first:
- score += env_reward
+ score = cumulative_reward
turns += 1
# Check if game is over (Player 0 win)
if done or truncation:
reward = env.reward(done=True, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(
p0_state,
@@ -952,7 +959,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
else: # Play continues
if p1_state is not None:
reward = env.reward(done=False, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p1_state, p1_state_flipped)),
[p1_action, 6 - p1_action],
[reward, reward],
@@ -964,29 +971,23 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
# Player 1"s turn
p1_action_mask = observation["action_mask"]
- p1_state = np.moveaxis(
- observation["observation"], [-1], [-3]
- )
- # Swap pieces so that the agent always sees the board from the same perspective
- p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
- p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
- p1_state = np.expand_dims(p1_state, 0)
+ p1_state, p1_state_flipped = transform_and_flip(observation, player = 1)
if not opponent_first:
if LESSON["opponent"] == "self":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_state, 0, p1_action_mask
)[0]
elif LESSON["opponent"] == "random":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_action_mask,
p0_action,
LESSON["block_vert_coef"],
)
else:
- p1_action = opponent.getAction(player=1)
+ p1_action = opponent.get_action(player=1)
else:
- p1_action = agent.getAction(
+ p1_action = agent.get_action(
p1_state, epsilon, p1_action_mask
)[
0
@@ -994,24 +995,19 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
train_actions_hist[p1_action] += 1
env.step(p1_action) # Act in environment
- observation, env_reward, done, truncation, _ = env.last()
- p1_next_state = np.moveaxis(
- observation["observation"], [-1], [-3]
- )
- p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
- p1_next_state_flipped = np.expand_dims(
- np.flip(p1_next_state, 2), 0
+ observation, cumulative_reward, done, truncation, _ = env.last()
+ p1_next_state, p1_next_state_flipped = transform_and_flip(
+ observation, player = 1
)
- p1_next_state = np.expand_dims(p1_next_state, 0)
if opponent_first:
- score += env_reward
+ score = cumulative_reward
turns += 1
# Check if game is over (Player 1 win)
if done or truncation:
reward = env.reward(done=True, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(
p0_state,
@@ -1045,7 +1041,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
else: # Play continues
reward = env.reward(done=False, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p0_state, p0_state_flipped)),
[p0_action, 6 - p0_action],
[reward, reward],
@@ -1100,7 +1096,7 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
rewards = []
for i in range(evo_loop):
env.reset() # Reset environment at start of episode
- observation, reward, done, truncation, _ = env.last()
+ observation, cumulative_reward, done, truncation, _ = env.last()
player = -1 # Tracker for which player"s turn it is
@@ -1120,42 +1116,42 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
if player < 0:
if opponent_first:
if LESSON["eval_opponent"] == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=0)
+ action = opponent.get_action(player=0)
else:
state = np.moveaxis(
observation["observation"], [-1], [-3]
)
state = np.expand_dims(state, 0)
- action = agent.getAction(state, 0, action_mask)[
+ action = agent.get_action(state, 0, action_mask)[
0
] # Get next action from agent
eval_actions_hist[action] += 1
if player > 0:
if not opponent_first:
if LESSON["eval_opponent"] == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=1)
+ action = opponent.get_action(player=1)
else:
state = np.moveaxis(
observation["observation"], [-1], [-3]
)
- state[[0, 1], :, :] = state[[0, 1], :, :]
+ state[[0, 1], :, :] = state[[1, 0], :, :]
state = np.expand_dims(state, 0)
- action = agent.getAction(state, 0, action_mask)[
+ action = agent.get_action(state, 0, action_mask)[
0
] # Get next action from agent
eval_actions_hist[action] += 1
env.step(action) # Act in environment
- observation, reward, done, truncation, _ = env.last()
+ observation, cumulative_reward, done, truncation, _ = env.last()
if (player > 0 and opponent_first) or (
player < 0 and not opponent_first
):
- score += reward
+ score = cumulative_reward
eval_turns += 1
@@ -1192,34 +1188,34 @@ At regular intervals, we evaluate the performance, or 'fitness', of the agents
for index, action in enumerate(eval_actions_hist)
}
- if wb:
- wandb_dict = {
- "global_step": total_steps,
- "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]),
- "train/mean_turns_per_game": mean_turns,
- "train/epsilon": epsilon,
- "train/opponent_updates": opp_update_counter,
- "eval/mean_fitness": np.mean(fitnesses),
- "eval/best_fitness": np.max(fitnesses),
- "eval/mean_turns_per_game": eval_turns,
- }
- wandb_dict.update(train_actions_dict)
- wandb_dict.update(eval_actions_dict)
- wandb.log(wandb_dict)
+ wandb_dict = {
+ "global_step": total_steps,
+ "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]),
+ "train/mean_turns_per_game": mean_turns,
+ "train/epsilon": epsilon,
+ "train/opponent_updates": opp_update_counter,
+ "eval/mean_fitness": np.mean(fitnesses),
+ "eval/best_fitness": np.max(fitnesses),
+ "eval/mean_turns_per_game": eval_turns,
+ }
+ wandb_dict.update(train_actions_dict)
+ wandb_dict.update(eval_actions_dict)
+ wandb.log(wandb_dict)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
if max_episodes > 0:
- if wb:
- wandb.finish()
+ wandb.finish()
# Save the trained agent
save_path = LESSON["save_path"]
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- elite.saveCheckpoint(save_path)
+ elite.save_checkpoint(save_path)
print(f"Elite agent saved to '{save_path}'.")
+
+ pbar.close()
```
diff --git a/docs/tutorials/agilerl/MADDPG.md b/docs/tutorials/agilerl/MADDPG.md
index bc6c52e8b..7052b8b1a 100644
--- a/docs/tutorials/agilerl/MADDPG.md
+++ b/docs/tutorials/agilerl/MADDPG.md
@@ -21,7 +21,7 @@ To follow this tutorial, you will need to install the dependencies shown below.
```
## Code
-### Train multiple agents using MADDPG
+### Train agents using MADDPG
The following code should run without any issues. The comments are designed to help you understand how to use PettingZoo with AgileRL. If you have any questions, please feel free to ask in the [Discord server](https://discord.com/invite/eB8HyTA2ux).
```{eval-rst}
diff --git a/tutorials/AgileRL/agilerl_dqn_curriculum.py b/tutorials/AgileRL/agilerl_dqn_curriculum.py
index 1b6e86949..6b9ec9770 100644
--- a/tutorials/AgileRL/agilerl_dqn_curriculum.py
+++ b/tutorials/AgileRL/agilerl_dqn_curriculum.py
@@ -2,6 +2,7 @@
Author: Nick (https://github.com/nicku-a)
"""
+
import copy
import os
import random
@@ -15,7 +16,7 @@
from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
+from agilerl.utils.utils import create_population
from tqdm import tqdm, trange
from pettingzoo.classic import connect_four_v3
@@ -66,27 +67,25 @@ def fill_replay_buffer(self, memory, opponent):
while not (done or truncation):
# Player 0's turn
p0_action_mask = observation["action_mask"]
- p0_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
- p0_state = np.expand_dims(p0_state, 0)
+ p0_state, p0_state_flipped = transform_and_flip(observation, player=0)
if opponent_first:
p0_action = self.env.action_space("player_0").sample(p0_action_mask)
else:
if self.lesson["warm_up_opponent"] == "random":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_action_mask, p1_action, self.lesson["block_vert_coef"]
)
else:
- p0_action = opponent.getAction(player=0)
+ p0_action = opponent.get_action(player=0)
self.step(p0_action) # Act in environment
observation, env_reward, done, truncation, _ = self.last()
- p0_next_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_next_state_flipped = np.expand_dims(np.flip(p0_next_state, 2), 0)
- p0_next_state = np.expand_dims(p0_next_state, 0)
+ p0_next_state, p0_next_state_flipped = transform_and_flip(
+ observation, player=0
+ )
if done or truncation:
reward = self.reward(done=True, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(p0_state, p1_state, p0_state_flipped, p1_state_flipped)
),
@@ -110,7 +109,7 @@ def fill_replay_buffer(self, memory, opponent):
else: # Play continues
if p1_state is not None:
reward = self.reward(done=False, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p1_state, p1_state_flipped)),
[p1_action, 6 - p1_action],
[reward, reward],
@@ -120,31 +119,29 @@ def fill_replay_buffer(self, memory, opponent):
# Player 1's turn
p1_action_mask = observation["action_mask"]
- p1_state = np.moveaxis(observation["observation"], [-1], [-3])
- p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
- p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
- p1_state = np.expand_dims(p1_state, 0)
+ p1_state, p1_state_flipped = transform_and_flip(
+ observation, player=1
+ )
if not opponent_first:
p1_action = self.env.action_space("player_1").sample(
p1_action_mask
)
else:
if self.lesson["warm_up_opponent"] == "random":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_action_mask, p0_action, LESSON["block_vert_coef"]
)
else:
- p1_action = opponent.getAction(player=1)
+ p1_action = opponent.get_action(player=1)
self.step(p1_action) # Act in environment
observation, env_reward, done, truncation, _ = self.last()
- p1_next_state = np.moveaxis(observation["observation"], [-1], [-3])
- p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
- p1_next_state_flipped = np.expand_dims(np.flip(p1_next_state, 2), 0)
- p1_next_state = np.expand_dims(p1_next_state, 0)
+ p1_next_state, p1_next_state_flipped = transform_and_flip(
+ observation, player=1
+ )
if done or truncation:
reward = self.reward(done=True, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(p0_state, p1_state, p0_state_flipped, p1_state_flipped)
),
@@ -168,7 +165,7 @@ def fill_replay_buffer(self, memory, opponent):
else: # Play continues
reward = self.reward(done=False, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p0_state, p0_state_flipped)),
[p0_action, 6 - p0_action],
[reward, reward],
@@ -323,11 +320,11 @@ def __init__(self, env, difficulty):
self.env = env.env
self.difficulty = difficulty
if self.difficulty == "random":
- self.getAction = self.random_opponent
+ self.get_action = self.random_opponent
elif self.difficulty == "weak":
- self.getAction = self.weak_rule_based_opponent
+ self.get_action = self.weak_rule_based_opponent
else:
- self.getAction = self.strong_rule_based_opponent
+ self.get_action = self.strong_rule_based_opponent
self.num_cols = 7
self.num_rows = 6
self.length = 4
@@ -482,6 +479,25 @@ def outcome(self, action, player, return_length=False):
return (True, reward, ended) + ((lengths,) if return_length else ())
+def transform_and_flip(observation, player):
+ """Transforms and flips observation for input to agent's neural network.
+
+ :param observation: Observation to preprocess
+ :type observation: dict[str, np.ndarray]
+ :param player: Player, 0 or 1
+ :type player: int
+ """
+ state = observation["observation"]
+ # Pre-process dimensions for PyTorch (N, C, H, W)
+ state = np.moveaxis(state, [-1], [-3])
+ if player == 1:
+ # Swap pieces so that the agent always sees the board from the same perspective
+ state[[0, 1], :, :] = state[[1, 0], :, :]
+ state_flipped = np.expand_dims(np.flip(state, 2), 0)
+ state = np.expand_dims(state, 0)
+ return state, state_flipped
+
+
if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("===== AgileRL Curriculum Learning Demo =====")
@@ -494,10 +510,10 @@ def outcome(self, action, player, return_length=False):
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [64, 64], # Actor hidden size
- "c_size": [128], # CNN channel size
- "k_size": [4], # CNN kernel size
- "s_size": [1], # CNN stride size
+ "hidden_size": [64, 64], # Actor hidden size
+ "channel_size": [128], # CNN channel size
+ "kernel_size": [4], # CNN kernel size
+ "stride_size": [1], # CNN stride size
"normalize": False, # Normalize image from range [0,255] to [0,1]
}
@@ -522,7 +538,6 @@ def outcome(self, action, player, return_length=False):
"NUM_ATOMS": 51, # Unit number of support
"V_MIN": 0.0, # Minimum value of support
"V_MAX": 200.0, # Maximum value of support
- "WANDB": False, # Use Weights and Biases tracking
}
# Define the connect four environment
@@ -549,7 +564,7 @@ def outcome(self, action, player, return_length=False):
action_dim = action_dim[0]
# Create a population ready for evolutionary hyper-parameter optimisation
- pop = initialPopulation(
+ pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
@@ -563,7 +578,6 @@ def outcome(self, action, player, return_length=False):
# Configure the replay buffer
field_names = ["state", "action", "reward", "next_state", "done"]
memory = ReplayBuffer(
- action_dim=action_dim, # Number of agent actions
memory_size=INIT_HP["MEMORY_SIZE"], # Max replay buffer size
field_names=field_names, # Field names to store in memory
device=device,
@@ -574,8 +588,8 @@ def outcome(self, action, player, return_length=False):
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
- evo_step=1,
- ) # Evaluate using last N fitness scores
+ eval_loop=1, # Evaluate using last N fitness scores
+ )
# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -606,12 +620,7 @@ def outcome(self, action, player, return_length=False):
# Define training loop parameters
episodes_per_epoch = 10
-
- # ! NOTE: Uncomment the max_episodes line below to change the number of training episodes. ! #
- # It is deliberately set low to allow testing to ensure this tutorial is sound.
- max_episodes = 10
- # max_episodes = LESSON["max_train_episodes"] # Total episodes
-
+ max_episodes = LESSON["max_train_episodes"] # Total episodes
max_steps = 500 # Maximum steps to take in each episode
evo_epochs = 20 # Evolution frequency
evo_loop = 50 # Number of evaluation episodes
@@ -620,12 +629,11 @@ def outcome(self, action, player, return_length=False):
eps_end = 0.1 # Final epsilon value
eps_decay = 0.9998 # Epsilon decays
opp_update_counter = 0
- wb = INIT_HP["WANDB"]
if LESSON["pretrained_path"] is not None:
for agent in pop:
# Load pretrained checkpoint
- agent.loadCheckpoint(LESSON["pretrained_path"])
+ agent.load_checkpoint(LESSON["pretrained_path"])
# Reinit optimizer for new task
agent.lr = INIT_HP["LR"]
agent.optimizer = torch.optim.Adam(
@@ -659,24 +667,23 @@ def outcome(self, action, player, return_length=False):
print("Agent population warmed up.")
if max_episodes > 0:
- if wb:
- wandb.init(
- # set the wandb project where this run will be logged
- project="AgileRL",
- name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format(
- "connect_four_v3",
- INIT_HP["ALGO"],
- LESSON["opponent"],
- datetime.now().strftime("%m%d%Y%H%M%S"),
- ),
- # track hyperparameters and run metadata
- config={
- "algo": "Evo HPO Rainbow DQN",
- "env": "connect_four_v3",
- "INIT_HP": INIT_HP,
- "lesson": LESSON,
- },
- )
+ wandb.init(
+ # set the wandb project where this run will be logged
+ project="AgileRL",
+ name="{}-EvoHPO-{}-{}Opposition-CNN-{}".format(
+ "connect_four_v3",
+ INIT_HP["ALGO"],
+ LESSON["opponent"],
+ datetime.now().strftime("%m%d%Y%H%M%S"),
+ ),
+ # track hyperparameters and run metadata
+ config={
+ "algo": "Evo HPO Rainbow DQN",
+ "env": "connect_four_v3",
+ "INIT_HP": INIT_HP,
+ "lesson": LESSON,
+ },
+ )
total_steps = 0
total_episodes = 0
@@ -689,7 +696,7 @@ def outcome(self, action, player, return_length=False):
for agent in pop: # Loop through population
for episode in range(episodes_per_epoch):
env.reset() # Reset environment at start of episode
- observation, env_reward, done, truncation, _ = env.last()
+ observation, cumulative_reward, done, truncation, _ = env.last()
(
p1_state,
@@ -718,23 +725,23 @@ def outcome(self, action, player, return_length=False):
for idx_step in range(max_steps):
# Player 0"s turn
p0_action_mask = observation["action_mask"]
- p0_state = np.moveaxis(observation["observation"], [-1], [-3])
- p0_state_flipped = np.expand_dims(np.flip(p0_state, 2), 0)
- p0_state = np.expand_dims(p0_state, 0)
+ p0_state, p0_state_flipped = transform_and_flip(
+ observation, player=0
+ )
if opponent_first:
if LESSON["opponent"] == "self":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_state, 0, p0_action_mask
)[0]
elif LESSON["opponent"] == "random":
- p0_action = opponent.getAction(
+ p0_action = opponent.get_action(
p0_action_mask, p1_action, LESSON["block_vert_coef"]
)
else:
- p0_action = opponent.getAction(player=0)
+ p0_action = opponent.get_action(player=0)
else:
- p0_action = agent.getAction(
+ p0_action = agent.get_action(
p0_state, epsilon, p0_action_mask
)[
0
@@ -742,23 +749,18 @@ def outcome(self, action, player, return_length=False):
train_actions_hist[p0_action] += 1
env.step(p0_action) # Act in environment
- observation, env_reward, done, truncation, _ = env.last()
- p0_next_state = np.moveaxis(
- observation["observation"], [-1], [-3]
+ observation, cumulative_reward, done, truncation, _ = env.last()
+ p0_next_state, p0_next_state_flipped = transform_and_flip(
+ observation, player=0
)
- p0_next_state_flipped = np.expand_dims(
- np.flip(p0_next_state, 2), 0
- )
- p0_next_state = np.expand_dims(p0_next_state, 0)
-
if not opponent_first:
- score += env_reward
+ score = cumulative_reward
turns += 1
# Check if game is over (Player 0 win)
if done or truncation:
reward = env.reward(done=True, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(
p0_state,
@@ -787,7 +789,7 @@ def outcome(self, action, player, return_length=False):
else: # Play continues
if p1_state is not None:
reward = env.reward(done=False, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p1_state, p1_state_flipped)),
[p1_action, 6 - p1_action],
[reward, reward],
@@ -799,29 +801,25 @@ def outcome(self, action, player, return_length=False):
# Player 1"s turn
p1_action_mask = observation["action_mask"]
- p1_state = np.moveaxis(
- observation["observation"], [-1], [-3]
+ p1_state, p1_state_flipped = transform_and_flip(
+ observation, player=1
)
- # Swap pieces so that the agent always sees the board from the same perspective
- p1_state[[0, 1], :, :] = p1_state[[0, 1], :, :]
- p1_state_flipped = np.expand_dims(np.flip(p1_state, 2), 0)
- p1_state = np.expand_dims(p1_state, 0)
if not opponent_first:
if LESSON["opponent"] == "self":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_state, 0, p1_action_mask
)[0]
elif LESSON["opponent"] == "random":
- p1_action = opponent.getAction(
+ p1_action = opponent.get_action(
p1_action_mask,
p0_action,
LESSON["block_vert_coef"],
)
else:
- p1_action = opponent.getAction(player=1)
+ p1_action = opponent.get_action(player=1)
else:
- p1_action = agent.getAction(
+ p1_action = agent.get_action(
p1_state, epsilon, p1_action_mask
)[
0
@@ -829,24 +827,25 @@ def outcome(self, action, player, return_length=False):
train_actions_hist[p1_action] += 1
env.step(p1_action) # Act in environment
- observation, env_reward, done, truncation, _ = env.last()
- p1_next_state = np.moveaxis(
- observation["observation"], [-1], [-3]
- )
- p1_next_state[[0, 1], :, :] = p1_next_state[[0, 1], :, :]
- p1_next_state_flipped = np.expand_dims(
- np.flip(p1_next_state, 2), 0
+ (
+ observation,
+ cumulative_reward,
+ done,
+ truncation,
+ _,
+ ) = env.last()
+ p1_next_state, p1_next_state_flipped = transform_and_flip(
+ observation, player=1
)
- p1_next_state = np.expand_dims(p1_next_state, 0)
if opponent_first:
- score += env_reward
+ score = cumulative_reward
turns += 1
# Check if game is over (Player 1 win)
if done or truncation:
reward = env.reward(done=True, player=1)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate(
(
p0_state,
@@ -880,7 +879,7 @@ def outcome(self, action, player, return_length=False):
else: # Play continues
reward = env.reward(done=False, player=0)
- memory.save2memoryVectEnvs(
+ memory.save_to_memory_vect_envs(
np.concatenate((p0_state, p0_state_flipped)),
[p0_action, 6 - p0_action],
[reward, reward],
@@ -935,7 +934,13 @@ def outcome(self, action, player, return_length=False):
rewards = []
for i in range(evo_loop):
env.reset() # Reset environment at start of episode
- observation, reward, done, truncation, _ = env.last()
+ (
+ observation,
+ cumulative_reward,
+ done,
+ truncation,
+ _,
+ ) = env.last()
                    player = -1  # Tracker for which player's turn it is
@@ -955,42 +960,52 @@ def outcome(self, action, player, return_length=False):
if player < 0:
if opponent_first:
if LESSON["eval_opponent"] == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=0)
+ action = opponent.get_action(player=0)
else:
state = np.moveaxis(
observation["observation"], [-1], [-3]
)
state = np.expand_dims(state, 0)
- action = agent.getAction(state, 0, action_mask)[
+ action = agent.get_action(
+ state, 0, action_mask
+ )[
0
] # Get next action from agent
eval_actions_hist[action] += 1
if player > 0:
if not opponent_first:
if LESSON["eval_opponent"] == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=1)
+ action = opponent.get_action(player=1)
else:
state = np.moveaxis(
observation["observation"], [-1], [-3]
)
- state[[0, 1], :, :] = state[[0, 1], :, :]
+ state[[0, 1], :, :] = state[[1, 0], :, :]
state = np.expand_dims(state, 0)
- action = agent.getAction(state, 0, action_mask)[
+ action = agent.get_action(
+ state, 0, action_mask
+ )[
0
] # Get next action from agent
eval_actions_hist[action] += 1
env.step(action) # Act in environment
- observation, reward, done, truncation, _ = env.last()
+ (
+ observation,
+ cumulative_reward,
+ done,
+ truncation,
+ _,
+ ) = env.last()
if (player > 0 and opponent_first) or (
player < 0 and not opponent_first
):
- score += reward
+ score = cumulative_reward
eval_turns += 1
@@ -1027,31 +1042,29 @@ def outcome(self, action, player, return_length=False):
for index, action in enumerate(eval_actions_hist)
}
- if wb:
- wandb_dict = {
- "global_step": total_steps,
- "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]),
- "train/mean_turns_per_game": mean_turns,
- "train/epsilon": epsilon,
- "train/opponent_updates": opp_update_counter,
- "eval/mean_fitness": np.mean(fitnesses),
- "eval/best_fitness": np.max(fitnesses),
- "eval/mean_turns_per_game": eval_turns,
- }
- wandb_dict.update(train_actions_dict)
- wandb_dict.update(eval_actions_dict)
- wandb.log(wandb_dict)
+ wandb_dict = {
+ "global_step": total_steps,
+ "train/mean_score": np.mean(agent.scores[-episodes_per_epoch:]),
+ "train/mean_turns_per_game": mean_turns,
+ "train/epsilon": epsilon,
+ "train/opponent_updates": opp_update_counter,
+ "eval/mean_fitness": np.mean(fitnesses),
+ "eval/best_fitness": np.max(fitnesses),
+ "eval/mean_turns_per_game": eval_turns,
+ }
+ wandb_dict.update(train_actions_dict)
+ wandb_dict.update(eval_actions_dict)
+ wandb.log(wandb_dict)
# Tournament selection and population mutation
elite, pop = tournament.select(pop)
pop = mutations.mutation(pop)
if max_episodes > 0:
- if wb:
- wandb.finish()
+ wandb.finish()
# Save the trained agent
save_path = LESSON["save_path"]
os.makedirs(os.path.dirname(save_path), exist_ok=True)
- elite.saveCheckpoint(save_path)
+ elite.save_checkpoint(save_path)
print(f"Elite agent saved to '{save_path}'.")
diff --git a/tutorials/AgileRL/agilerl_maddpg.py b/tutorials/AgileRL/agilerl_maddpg.py
index 14a93f6e1..99d19e17c 100644
--- a/tutorials/AgileRL/agilerl_maddpg.py
+++ b/tutorials/AgileRL/agilerl_maddpg.py
@@ -2,76 +2,88 @@
Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
"""
+
import os
+from copy import deepcopy
import numpy as np
import supersuit as ss
import torch
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
-from agilerl.hpo.mutation import Mutations
-from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
+from agilerl.utils.utils import create_population
+from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv
from tqdm import trange
from pettingzoo.atari import space_invaders_v2
if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- print("===== AgileRL MADDPG Demo =====")
# Define the network configuration
NET_CONFIG = {
"arch": "cnn", # Network architecture
- "h_size": [32, 32], # Network hidden size
- "c_size": [32, 32], # CNN channel size
- "k_size": [3, 3], # CNN kernel size
- "s_size": [2, 2], # CNN stride size
+ "hidden_size": [32, 32], # Network hidden size
+ "channel_size": [32, 32], # CNN channel size
+ "kernel_size": [3, 3], # CNN kernel size
+ "stride_size": [2, 2], # CNN stride size
"normalize": True, # Normalize image from range [0,255] to [0,1]
}
# Define the initial hyperparameters
INIT_HP = {
- "POPULATION_SIZE": 2,
+ "POPULATION_SIZE": 1,
"ALGO": "MADDPG", # Algorithm
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": True,
- "BATCH_SIZE": 8, # Batch size
+ "BATCH_SIZE": 32, # Batch size
+ "O_U_NOISE": True, # Ornstein Uhlenbeck action noise
+ "EXPL_NOISE": 0.1, # Action noise scale
+ "MEAN_NOISE": 0.0, # Mean action noise
+ "THETA": 0.15, # Rate of mean reversion in OU noise
+ "DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
- "LR_CRITIC": 0.01, # Critic learning rate
+ "LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
- "MEMORY_SIZE": 10000, # Max memory buffer size
- "LEARN_STEP": 5, # Learning frequency
+ "MEMORY_SIZE": 100000, # Max memory buffer size
+ "LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
}
+ num_envs = 8
# Define the space invaders environment as a parallel environment
env = space_invaders_v2.parallel_env()
- if INIT_HP["CHANNELS_LAST"]:
- # Environment processing for image based observations
- env = ss.frame_skip_v0(env, 4)
- env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1)
- env = ss.color_reduction_v0(env, mode="B")
- env = ss.resize_v1(env, x_size=84, y_size=84)
- env = ss.frame_stack_v1(env, 4)
+
+ # Environment processing for image based observations
+ env = ss.frame_skip_v0(env, 4)
+ env = ss.clip_reward_v0(env, lower_bound=-1, upper_bound=1)
+ env = ss.color_reduction_v0(env, mode="B")
+ env = ss.resize_v1(env, x_size=84, y_size=84)
+ env = ss.frame_stack_v1(env, 4)
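+    # Stack num_envs copies of the wrapped environment into an asynchronous vectorised environment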
+ env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)])
+
env.reset()
# Configure the multi-agent algo input arguments
try:
- state_dim = [env.observation_space(agent).n for agent in env.agents]
+ state_dim = [env.single_observation_space(agent).n for agent in env.agents]
one_hot = True
except Exception:
- state_dim = [env.observation_space(agent).shape for agent in env.agents]
+ state_dim = [env.single_observation_space(agent).shape for agent in env.agents]
one_hot = False
try:
- action_dim = [env.action_space(agent).n for agent in env.agents]
+ action_dim = [env.single_action_space(agent).n for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = True
INIT_HP["MAX_ACTION"] = None
INIT_HP["MIN_ACTION"] = None
except Exception:
- action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
+ action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = False
- INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
- INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
+ INIT_HP["MAX_ACTION"] = [
+ env.single_action_space(agent).high for agent in env.agents
+ ]
+ INIT_HP["MIN_ACTION"] = [
+ env.single_action_space(agent).low for agent in env.agents
+ ]
# Pre-process image dimensions for pytorch convolutional layers
if INIT_HP["CHANNELS_LAST"]:
@@ -84,7 +96,7 @@
INIT_HP["AGENT_IDS"] = env.agents
# Create a population ready for evolutionary hyper-parameter optimisation
- pop = initialPopulation(
+ agent = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
@@ -92,8 +104,9 @@
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
+ num_envs=num_envs,
device=device,
- )
+ )[0]
# Configure the multi-agent replay buffer
field_names = ["state", "action", "reward", "next_state", "done"]
@@ -104,152 +117,138 @@
device=device,
)
- # Instantiate a tournament selection object (used for HPO)
- tournament = TournamentSelection(
- tournament_size=2, # Tournament selection size
- elitism=True, # Elitism in tournament selection
- population_size=INIT_HP["POPULATION_SIZE"], # Population size
- evo_step=1,
- ) # Evaluate using last N fitness scores
-
- # Instantiate a mutations object (used for HPO)
- mutations = Mutations(
- algo=INIT_HP["ALGO"],
- no_mutation=0.2, # Probability of no mutation
- architecture=0.2, # Probability of architecture mutation
- new_layer_prob=0.2, # Probability of new layer mutation
- parameters=0.2, # Probability of parameter mutation
- activation=0, # Probability of activation function mutation
- rl_hp=0.2, # Probability of RL hyperparameter mutation
- rl_hp_selection=[
- "lr",
- "learn_step",
- "batch_size",
- ], # RL hyperparams selected for mutation
- mutation_sd=0.1, # Mutation strength
- # Define search space for each hyperparameter
- min_lr=0.0001,
- max_lr=0.01,
- min_learn_step=1,
- max_learn_step=120,
- min_batch_size=8,
- max_batch_size=64,
- agent_ids=INIT_HP["AGENT_IDS"], # Agent IDs
- arch=NET_CONFIG["arch"], # MLP or CNN
- rand_seed=1,
- device=device,
- )
-
# Define training loop parameters
- max_episodes = 5 # Total episodes (default: 6000)
- max_steps = 900 # Maximum steps to take in each episode
- epsilon = 1.0 # Starting epsilon value
- eps_end = 0.1 # Final epsilon value
- eps_decay = 0.995 # Epsilon decay
- evo_epochs = 20 # Evolution frequency
- evo_loop = 1 # Number of evaluation episodes
- elite = pop[0] # Assign a placeholder "elite" agent
-
- # Training loop
- for idx_epi in trange(max_episodes):
- for agent in pop: # Loop through population
- state, info = env.reset() # Reset environment at start of episode
- agent_reward = {agent_id: 0 for agent_id in env.agents}
+ agent_ids = deepcopy(env.agents)
+ max_steps = 20000 # Max steps (default: 2000000)
+ learning_delay = 500 # Steps before starting learning
+ training_steps = 10000 # Frequency at which we evaluate training score
+ eval_steps = None # Evaluation steps per episode - go until done
+ eval_loop = 1 # Number of evaluation episodes
+
+ total_steps = 0
+
+ # TRAINING LOOP
+ print("Training...")
+ pbar = trange(max_steps, unit="step")
+ while np.less(agent.steps[-1], max_steps):
+ state, info = env.reset() # Reset environment at start of episode
+ scores = np.zeros((num_envs, len(agent_ids)))
+ completed_episode_scores = []
+ steps = 0
+ if INIT_HP["CHANNELS_LAST"]:
+ state = {
+ agent_id: np.moveaxis(s, [-1], [-3]) for agent_id, s in state.items()
+ }
+
+ for idx_step in range(training_steps // num_envs):
+ # Get next action from agent
+ cont_actions, discrete_action = agent.get_action(
+ states=state, training=True, infos=info
+ )
+ if agent.discrete_actions:
+ action = discrete_action
+ else:
+ action = cont_actions
+
+ # Act in environment
+            next_state, reward, termination, truncation, info = env.step(action)
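+            # reward is a dict of per-agent arrays of shape (num_envs,); stacking and
+            # transposing them gives one row per environment, matching scores (num_envs, n_agents)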
+ scores += np.array(list(reward.values())).transpose()
+ total_steps += num_envs
+ steps += num_envs
+
+ # Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
- state = {
- agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
- for agent_id, s in state.items()
+ next_state = {
+ agent_id: np.moveaxis(ns, [-1], [-3])
+ for agent_id, ns in next_state.items()
}
- for _ in range(max_steps):
- agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
- env_defined_actions = (
- info["env_defined_actions"]
- if "env_defined_actions" in info.keys()
- else None
- )
-
- # Get next action from agent
- cont_actions, discrete_action = agent.getAction(
- state, epsilon, agent_mask, env_defined_actions
- )
- if agent.discrete_actions:
- action = discrete_action
- else:
- action = cont_actions
-
- next_state, reward, termination, truncation, info = env.step(
- action
- ) # Act in environment
-
- # Image processing if necessary for the environment
- if INIT_HP["CHANNELS_LAST"]:
- state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
- next_state = {
- agent_id: np.moveaxis(ns, [-1], [-3])
- for agent_id, ns in next_state.items()
- }
-
- # Save experiences to replay buffer
- memory.save2memory(state, cont_actions, reward, next_state, termination)
-
- # Collect the reward
- for agent_id, r in reward.items():
- agent_reward[agent_id] += r
-
- # Learn according to learning frequency
- if (memory.counter % agent.learn_step == 0) and (
- len(memory) >= agent.batch_size
+
+ # Save experiences to replay buffer
+ memory.save_to_memory(
+ state,
+ cont_actions,
+ reward,
+ next_state,
+ termination,
+ is_vectorised=True,
+ )
+
+ # Learn according to learning frequency
+ # Handle learn steps > num_envs
+ if agent.learn_step > num_envs:
+ learn_step = agent.learn_step // num_envs
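+                # e.g. with LEARN_STEP=100 and num_envs=8, learn_step=12, so learning
+                # happens roughly every 12 * 8 = 96 environment steps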
+ if (
+ idx_step % learn_step == 0
+ and len(memory) >= agent.batch_size
+ and memory.counter > learning_delay
):
- experiences = memory.sample(
- agent.batch_size
- ) # Sample replay buffer
- agent.learn(experiences) # Learn according to agent's RL algorithm
-
- # Update the state
- if INIT_HP["CHANNELS_LAST"]:
- next_state = {
- agent_id: np.expand_dims(ns, 0)
- for agent_id, ns in next_state.items()
- }
- state = next_state
-
- # Stop episode if any agents have terminated
- if any(truncation.values()) or any(termination.values()):
- break
-
- # Save the total episode reward
- score = sum(agent_reward.values())
- agent.scores.append(score)
-
- # Update epsilon for exploration
- epsilon = max(eps_end, epsilon * eps_decay)
-
- # Now evolve population if necessary
- if (idx_epi + 1) % evo_epochs == 0:
- # Evaluate population
- fitnesses = [
- agent.test(
- env,
- swap_channels=INIT_HP["CHANNELS_LAST"],
- max_steps=max_steps,
- loop=evo_loop,
- )
- for agent in pop
- ]
-
- print(f"Episode {idx_epi + 1}/{max_episodes}")
- print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
+ # Sample replay buffer
+ experiences = memory.sample(agent.batch_size)
+ # Learn according to agent's RL algorithm
+ agent.learn(experiences)
+ # Handle num_envs > learn step; learn multiple times per step in env
+ elif len(memory) >= agent.batch_size and memory.counter > learning_delay:
+ for _ in range(num_envs // agent.learn_step):
+ # Sample replay buffer
+ experiences = memory.sample(agent.batch_size)
+ # Learn according to agent's RL algorithm
+ agent.learn(experiences)
+
+ state = next_state
+
+ # Calculate scores and reset noise for finished episodes
+ reset_noise_indices = []
+ term_array = np.array(list(termination.values())).transpose()
+ trunc_array = np.array(list(truncation.values())).transpose()
+ for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
+ if np.any(d) or np.any(t):
+ completed_episode_scores.append(scores[idx])
+ agent.scores.append(scores[idx])
+ scores[idx] = 0
+ reset_noise_indices.append(idx)
+ agent.reset_action_noise(reset_noise_indices)
+
+ pbar.update(training_steps)
+
+ agent.steps[-1] += steps
+
+ # Evaluate population
+ fitness = agent.test(
+ env,
+ swap_channels=INIT_HP["CHANNELS_LAST"],
+ max_steps=eval_steps,
+ loop=eval_loop,
+ sum_scores=False,
+ )
+ pop_episode_scores = np.array(completed_episode_scores)
+ mean_scores = np.mean(pop_episode_scores, axis=0)
+
+ print(f"--- Global steps {total_steps} ---")
+ print(f"Steps {agent.steps[-1]}")
+ print("Scores:")
+ for idx, sub_agent in enumerate(agent_ids):
+ print(f" {sub_agent} score: {mean_scores[idx]}")
+ print("Fitness")
+ for idx, sub_agent in enumerate(agent_ids):
+ print(f" {sub_agent} fitness: {fitness[idx]}")
+ print("Previous 5 fitness avgs")
+ for idx, sub_agent in enumerate(agent_ids):
print(
- f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
+ f" {sub_agent} fitness average: {np.mean(agent.fitness[-5:], axis=0)[idx]}"
)
- # Tournament selection and population mutation
- elite, pop = tournament.select(pop)
- pop = mutations.mutation(pop)
+ # Update step counter
+ agent.steps.append(agent.steps[-1])
# Save the trained algorithm
path = "./models/MADDPG"
filename = "MADDPG_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
- elite.saveCheckpoint(save_path)
+ agent.save_checkpoint(save_path)
+
+ pbar.close()
+ env.close()
diff --git a/tutorials/AgileRL/agilerl_matd3.py b/tutorials/AgileRL/agilerl_matd3.py
index 46aefb5bf..11335b45a 100644
--- a/tutorials/AgileRL/agilerl_matd3.py
+++ b/tutorials/AgileRL/agilerl_matd3.py
@@ -2,6 +2,7 @@
Authors: Michael (https://github.com/mikepratt1), Nick (https://github.com/nicku-a)
"""
+
import os
import numpy as np
@@ -9,19 +10,20 @@
from agilerl.components.multi_agent_replay_buffer import MultiAgentReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
-from agilerl.utils.utils import initialPopulation
+from agilerl.utils.utils import create_population
+from agilerl.vector.pz_async_vec_env import AsyncPettingZooVecEnv
from tqdm import trange
from pettingzoo.mpe import simple_speaker_listener_v4
if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- print("===== AgileRL MATD3 Demo =====")
+ print("===== AgileRL Online Multi-Agent Demo =====")
# Define the network configuration
NET_CONFIG = {
"arch": "mlp", # Network architecture
- "h_size": [32, 32], # Actor hidden size
+ "hidden_size": [32, 32], # Actor hidden size
}
# Define the initial hyperparameters
@@ -31,36 +33,47 @@
# Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
"CHANNELS_LAST": False,
"BATCH_SIZE": 32, # Batch size
+ "O_U_NOISE": True, # Ornstein Uhlenbeck action noise
+ "EXPL_NOISE": 0.1, # Action noise scale
+ "MEAN_NOISE": 0.0, # Mean action noise
+ "THETA": 0.15, # Rate of mean reversion in OU noise
+ "DT": 0.01, # Timestep for OU noise
"LR_ACTOR": 0.001, # Actor learning rate
- "LR_CRITIC": 0.01, # Critic learning rate
+ "LR_CRITIC": 0.001, # Critic learning rate
"GAMMA": 0.95, # Discount factor
"MEMORY_SIZE": 100000, # Max memory buffer size
- "LEARN_STEP": 5, # Learning frequency
+ "LEARN_STEP": 100, # Learning frequency
"TAU": 0.01, # For soft update of target parameters
"POLICY_FREQ": 2, # Policy frequnecy
}
+ num_envs = 8
# Define the simple speaker listener environment as a parallel environment
env = simple_speaker_listener_v4.parallel_env(continuous_actions=True)
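+    # Stack num_envs copies of the environment into an asynchronous vectorised environment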
+ env = AsyncPettingZooVecEnv([lambda: env for _ in range(num_envs)])
env.reset()
# Configure the multi-agent algo input arguments
try:
- state_dim = [env.observation_space(agent).n for agent in env.agents]
+ state_dim = [env.single_observation_space(agent).n for agent in env.agents]
one_hot = True
except Exception:
- state_dim = [env.observation_space(agent).shape for agent in env.agents]
+ state_dim = [env.single_observation_space(agent).shape for agent in env.agents]
one_hot = False
try:
- action_dim = [env.action_space(agent).n for agent in env.agents]
+ action_dim = [env.single_action_space(agent).n for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = True
INIT_HP["MAX_ACTION"] = None
INIT_HP["MIN_ACTION"] = None
except Exception:
- action_dim = [env.action_space(agent).shape[0] for agent in env.agents]
+ action_dim = [env.single_action_space(agent).shape[0] for agent in env.agents]
INIT_HP["DISCRETE_ACTIONS"] = False
- INIT_HP["MAX_ACTION"] = [env.action_space(agent).high for agent in env.agents]
- INIT_HP["MIN_ACTION"] = [env.action_space(agent).low for agent in env.agents]
+ INIT_HP["MAX_ACTION"] = [
+ env.single_action_space(agent).high for agent in env.agents
+ ]
+ INIT_HP["MIN_ACTION"] = [
+ env.single_action_space(agent).low for agent in env.agents
+ ]
# Not applicable to MPE environments, used when images are used for observations (Atari environments)
if INIT_HP["CHANNELS_LAST"]:
@@ -73,7 +86,7 @@
INIT_HP["AGENT_IDS"] = env.agents
# Create a population ready for evolutionary hyper-parameter optimisation
- pop = initialPopulation(
+ pop = create_population(
INIT_HP["ALGO"],
state_dim,
action_dim,
@@ -81,6 +94,7 @@
NET_CONFIG,
INIT_HP,
population_size=INIT_HP["POPULATION_SIZE"],
+ num_envs=num_envs,
device=device,
)
@@ -98,8 +112,8 @@
tournament_size=2, # Tournament selection size
elitism=True, # Elitism in tournament selection
population_size=INIT_HP["POPULATION_SIZE"], # Population size
- evo_step=1,
- ) # Evaluate using last N fitness scores
+ eval_loop=1, # Evaluate using last N fitness scores
+ )
# Instantiate a mutations object (used for HPO)
mutations = Mutations(
@@ -123,116 +137,148 @@
)
# Define training loop parameters
- max_episodes = 500 # Total episodes (default: 6000)
- max_steps = 25 # Maximum steps to take in each episode
- epsilon = 1.0 # Starting epsilon value
- eps_end = 0.1 # Final epsilon value
- eps_decay = 0.995 # Epsilon decay
- evo_epochs = 20 # Evolution frequency
- evo_loop = 1 # Number of evaluation episodes
+ max_steps = 13000 # Max steps (default: 2000000)
+ learning_delay = 0 # Steps before starting learning
+ evo_steps = 1000 # Evolution frequency
+ eval_steps = None # Evaluation steps per episode - go until done
+ eval_loop = 1 # Number of evaluation episodes
elite = pop[0] # Assign a placeholder "elite" agent
- # Training loop
- for idx_epi in trange(max_episodes):
+ total_steps = 0
+
+ # TRAINING LOOP
+ print("Training...")
+ pbar = trange(max_steps, unit="step")
+ while np.less([agent.steps[-1] for agent in pop], max_steps).all():
+ pop_episode_scores = []
for agent in pop: # Loop through population
state, info = env.reset() # Reset environment at start of episode
- agent_reward = {agent_id: 0 for agent_id in env.agents}
+ scores = np.zeros(num_envs)
+ completed_episode_scores = []
+ steps = 0
if INIT_HP["CHANNELS_LAST"]:
state = {
- agent_id: np.moveaxis(np.expand_dims(s, 0), [-1], [-3])
+ agent_id: np.moveaxis(s, [-1], [-3])
for agent_id, s in state.items()
}
- for _ in range(max_steps):
- agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
- env_defined_actions = (
- info["env_defined_actions"]
- if "env_defined_actions" in info.keys()
- else None
- )
-
+ for idx_step in range(evo_steps // num_envs):
# Get next action from agent
- cont_actions, discrete_action = agent.getAction(
- state, epsilon, agent_mask, env_defined_actions
+ cont_actions, discrete_action = agent.get_action(
+ states=state, training=True, infos=info
)
if agent.discrete_actions:
action = discrete_action
else:
action = cont_actions
- next_state, reward, termination, truncation, info = env.step(
- action
- ) # Act in environment
+ # Act in environment
+ next_state, reward, termination, truncation, info = env.step(action)
+
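+                # Sum the per-agent rewards in each of the num_envs environments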
+ scores += np.sum(np.array(list(reward.values())).transpose(), axis=-1)
+ total_steps += num_envs
+ steps += num_envs
# Image processing if necessary for the environment
if INIT_HP["CHANNELS_LAST"]:
- state = {agent_id: np.squeeze(s) for agent_id, s in state.items()}
next_state = {
agent_id: np.moveaxis(ns, [-1], [-3])
for agent_id, ns in next_state.items()
}
# Save experiences to replay buffer
- memory.save2memory(state, cont_actions, reward, next_state, termination)
-
- # Collect the reward
- for agent_id, r in reward.items():
- agent_reward[agent_id] += r
+ memory.save_to_memory(
+ state,
+ cont_actions,
+ reward,
+ next_state,
+ termination,
+ is_vectorised=True,
+ )
# Learn according to learning frequency
- if (memory.counter % agent.learn_step == 0) and (
- len(memory) >= agent.batch_size
+ # Handle learn steps > num_envs
+ if agent.learn_step > num_envs:
+ learn_step = agent.learn_step // num_envs
+ if (
+ idx_step % learn_step == 0
+ and len(memory) >= agent.batch_size
+ and memory.counter > learning_delay
+ ):
+ # Sample replay buffer
+ experiences = memory.sample(agent.batch_size)
+ # Learn according to agent's RL algorithm
+ agent.learn(experiences)
+ # Handle num_envs > learn step; learn multiple times per step in env
+ elif (
+ len(memory) >= agent.batch_size and memory.counter > learning_delay
):
- experiences = memory.sample(
- agent.batch_size
- ) # Sample replay buffer
- agent.learn(experiences) # Learn according to agent's RL algorithm
+ for _ in range(num_envs // agent.learn_step):
+ # Sample replay buffer
+ experiences = memory.sample(agent.batch_size)
+ # Learn according to agent's RL algorithm
+ agent.learn(experiences)
- # Update the state
- if INIT_HP["CHANNELS_LAST"]:
- next_state = {
- agent_id: np.expand_dims(ns, 0)
- for agent_id, ns in next_state.items()
- }
state = next_state
- # Stop episode if any agents have terminated
- if any(truncation.values()) or any(termination.values()):
- break
-
- # Save the total episode reward
- score = sum(agent_reward.values())
- agent.scores.append(score)
-
- # Update epsilon for exploration
- epsilon = max(eps_end, epsilon * eps_decay)
-
- # Now evolve population if necessary
- if (idx_epi + 1) % evo_epochs == 0:
- # Evaluate population
- fitnesses = [
- agent.test(
- env,
- swap_channels=INIT_HP["CHANNELS_LAST"],
- max_steps=max_steps,
- loop=evo_loop,
- )
- for agent in pop
- ]
+ # Calculate scores and reset noise for finished episodes
+ reset_noise_indices = []
+ term_array = np.array(list(termination.values())).transpose()
+ trunc_array = np.array(list(truncation.values())).transpose()
+ for idx, (d, t) in enumerate(zip(term_array, trunc_array)):
+ if np.any(d) or np.any(t):
+ completed_episode_scores.append(scores[idx])
+ agent.scores.append(scores[idx])
+ scores[idx] = 0
+ reset_noise_indices.append(idx)
+ agent.reset_action_noise(reset_noise_indices)
- print(f"Episode {idx_epi + 1}/{max_episodes}")
- print(f'Fitnesses: {["%.2f" % fitness for fitness in fitnesses]}')
- print(
- f'100 fitness avgs: {["%.2f" % np.mean(agent.fitness[-100:]) for agent in pop]}'
+ pbar.update(evo_steps // len(pop))
+
+ agent.steps[-1] += steps
+ pop_episode_scores.append(completed_episode_scores)
+
+ # Evaluate population
+ fitnesses = [
+ agent.test(
+ env,
+ swap_channels=INIT_HP["CHANNELS_LAST"],
+ max_steps=eval_steps,
+ loop=eval_loop,
)
+ for agent in pop
+ ]
+ mean_scores = [
+ (
+ np.mean(episode_scores)
+ if len(episode_scores) > 0
+ else "0 completed episodes"
+ )
+ for episode_scores in pop_episode_scores
+ ]
+
+ print(f"--- Global steps {total_steps} ---")
+ print(f"Steps {[agent.steps[-1] for agent in pop]}")
+ print(f"Scores: {mean_scores}")
+ print(f'Fitnesses: {["%.2f"%fitness for fitness in fitnesses]}')
+ print(
+ f'5 fitness avgs: {["%.2f"%np.mean(agent.fitness[-5:]) for agent in pop]}'
+ )
- # Tournament selection and population mutation
- elite, pop = tournament.select(pop)
- pop = mutations.mutation(pop)
+ # Tournament selection and population mutation
+ elite, pop = tournament.select(pop)
+ pop = mutations.mutation(pop)
+
+ # Update step counter
+ for agent in pop:
+ agent.steps.append(agent.steps[-1])
# Save the trained algorithm
path = "./models/MATD3"
filename = "MATD3_trained_agent.pt"
os.makedirs(path, exist_ok=True)
save_path = os.path.join(path, filename)
- elite.saveCheckpoint(save_path)
+ elite.save_checkpoint(save_path)
+
+ pbar.close()
+ env.close()
diff --git a/tutorials/AgileRL/render_agilerl_dqn.py b/tutorials/AgileRL/render_agilerl_dqn.py
index f5a2d4b38..67d3ad9cc 100644
--- a/tutorials/AgileRL/render_agilerl_dqn.py
+++ b/tutorials/AgileRL/render_agilerl_dqn.py
@@ -4,7 +4,7 @@
import numpy as np
import torch
from agilerl.algorithms.dqn import DQN
-from agilerl_dqn_curriculum import Opponent
+from agilerl_dqn_curriculum import Opponent, transform_and_flip
from PIL import Image, ImageDraw, ImageFont
from pettingzoo.classic import connect_four_v3
@@ -68,16 +68,8 @@ def resize_frames(frames, fraction):
state_dim = np.zeros(state_dim[0]).flatten().shape
action_dim = action_dim[0]
- # Instantiate an DQN object
- dqn = DQN(
- state_dim,
- action_dim,
- one_hot,
- device=device,
- )
-
- # Load the saved algorithm into the DQN object
- dqn.loadCheckpoint(path)
+ # Load the saved agent
+ dqn = DQN.load(path, device)
for opponent_difficulty in ["random", "weak", "strong", "self"]:
# Create opponent
@@ -120,38 +112,35 @@ def resize_frames(frames, fraction):
for idx_step in range(max_steps):
action_mask = observation["action_mask"]
if player < 0:
- state = np.moveaxis(observation["observation"], [-1], [-3])
- state = np.expand_dims(state, 0)
+ state, _ = transform_and_flip(observation, player=0)
if opponent_first:
if opponent_difficulty == "self":
- action = opponent.getAction(
+ action = opponent.get_action(
state, epsilon=0, action_mask=action_mask
)[0]
elif opponent_difficulty == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=0)
+ action = opponent.get_action(player=0)
else:
- action = dqn.getAction(
+ action = dqn.get_action(
state, epsilon=0, action_mask=action_mask
)[
0
] # Get next action from agent
if player > 0:
- state = np.moveaxis(observation["observation"], [-1], [-3])
- state[[0, 1], :, :] = state[[0, 1], :, :]
- state = np.expand_dims(state, 0)
+ state, _ = transform_and_flip(observation, player=1)
if not opponent_first:
if opponent_difficulty == "self":
- action = opponent.getAction(
+ action = opponent.get_action(
state, epsilon=0, action_mask=action_mask
)[0]
elif opponent_difficulty == "random":
- action = opponent.getAction(action_mask)
+ action = opponent.get_action(action_mask)
else:
- action = opponent.getAction(player=1)
+ action = opponent.get_action(player=1)
else:
- action = dqn.getAction(
+ action = dqn.get_action(
state, epsilon=0, action_mask=action_mask
)[
0
diff --git a/tutorials/AgileRL/render_agilerl_maddpg.py b/tutorials/AgileRL/render_agilerl_maddpg.py
index ca47349d5..2713b48fd 100644
--- a/tutorials/AgileRL/render_agilerl_maddpg.py
+++ b/tutorials/AgileRL/render_agilerl_maddpg.py
@@ -68,22 +68,9 @@ def _label_with_episode_number(frame, episode_num):
n_agents = env.num_agents
agent_ids = env.agents
- # Instantiate an MADDPG object
- maddpg = MADDPG(
- state_dim,
- action_dim,
- one_hot,
- n_agents,
- agent_ids,
- max_action,
- min_action,
- discrete_actions,
- device=device,
- )
-
- # Load the saved algorithm into the MADDPG object
+ # Load the saved agent
path = "./models/MADDPG/MADDPG_trained_agent.pt"
- maddpg.loadCheckpoint(path)
+ maddpg = MADDPG.load(path, device)
# Define test loop parameters
episodes = 10 # Number of episodes to test agent on
@@ -106,20 +93,9 @@ def _label_with_episode_number(frame, episode_num):
agent_id: np.moveaxis(np.expand_dims(s, 0), [3], [1])
for agent_id, s in state.items()
}
-
- agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
- env_defined_actions = (
- info["env_defined_actions"]
- if "env_defined_actions" in info.keys()
- else None
- )
-
# Get next action from agent
- cont_actions, discrete_action = maddpg.getAction(
- state,
- epsilon=0,
- agent_mask=agent_mask,
- env_defined_actions=env_defined_actions,
+ cont_actions, discrete_action = maddpg.get_action(
+ state, training=False, infos=info
)
if maddpg.discrete_actions:
action = discrete_action
@@ -131,7 +107,9 @@ def _label_with_episode_number(frame, episode_num):
frames.append(_label_with_episode_number(frame, episode_num=ep))
# Take action in environment
- state, reward, termination, truncation, info = env.step(action)
+ state, reward, termination, truncation, info = env.step(
+ {agent: a.squeeze() for agent, a in action.items()}
+ )
# Save agent's reward for this step in this episode
for agent_id, r in reward.items():
diff --git a/tutorials/AgileRL/render_agilerl_matd3.py b/tutorials/AgileRL/render_agilerl_matd3.py
index efcc610cd..8bfae5673 100644
--- a/tutorials/AgileRL/render_agilerl_matd3.py
+++ b/tutorials/AgileRL/render_agilerl_matd3.py
@@ -55,22 +55,9 @@ def _label_with_episode_number(frame, episode_num):
n_agents = env.num_agents
agent_ids = env.agents
- # Instantiate an MADDPG object
- matd3 = MATD3(
- state_dim,
- action_dim,
- one_hot,
- n_agents,
- agent_ids,
- max_action,
- min_action,
- discrete_actions,
- device=device,
- )
-
- # Load the saved algorithm into the MADDPG object
+ # Load the saved agent
path = "./models/MATD3/MATD3_trained_agent.pt"
- matd3.loadCheckpoint(path)
+ matd3 = MATD3.load(path, device)
# Define test loop parameters
episodes = 10 # Number of episodes to test agent on
@@ -94,19 +81,9 @@ def _label_with_episode_number(frame, episode_num):
agent_reward = {agent_id: 0 for agent_id in agent_ids}
score = 0
for _ in range(max_steps):
- agent_mask = info["agent_mask"] if "agent_mask" in info.keys() else None
- env_defined_actions = (
- info["env_defined_actions"]
- if "env_defined_actions" in info.keys()
- else None
- )
-
# Get next action from agent
- cont_actions, discrete_action = matd3.getAction(
- state,
- epsilon=0,
- agent_mask=agent_mask,
- env_defined_actions=env_defined_actions,
+ cont_actions, discrete_action = matd3.get_action(
+ state, training=False, infos=info
)
if matd3.discrete_actions:
action = discrete_action
@@ -118,7 +95,9 @@ def _label_with_episode_number(frame, episode_num):
frames.append(_label_with_episode_number(frame, episode_num=ep))
# Take action in environment
- state, reward, termination, truncation, info = env.step(action)
+ state, reward, termination, truncation, info = env.step(
+ {agent: a.squeeze() for agent, a in action.items()}
+ )
# Save agent's reward for this step in this episode
for agent_id, r in reward.items():
diff --git a/tutorials/AgileRL/requirements.txt b/tutorials/AgileRL/requirements.txt
index dbdd050a0..35b6d42a9 100644
--- a/tutorials/AgileRL/requirements.txt
+++ b/tutorials/AgileRL/requirements.txt
@@ -1,5 +1,4 @@
-agilerl==0.1.21; python_version >= '3.9'
-agilerl==0.1.20; python_version < '3.9'
+agilerl==1.0.16; python_version >= '3.10'
pettingzoo[classic,atari,mpe]>=1.23.1
SuperSuit>=3.9.0
torch>=2.0.1
From 0fa238a987c8d1c13f0318faa648614e41cc4f29 Mon Sep 17 00:00:00 2001
From: Jordan Terry
Date: Tue, 12 Nov 2024 13:29:10 -0500
Subject: [PATCH 18/22] Update README.md
---
README.md | 5 -----
1 file changed, 5 deletions(-)
diff --git a/README.md b/README.md
index 395c9cbe9..a55f74106 100644
--- a/README.md
+++ b/README.md
@@ -73,11 +73,6 @@ SuperSuit is a library that includes all commonly used wrappers in RL (frame sta
PettingZoo keeps strict versioning for reproducibility reasons. All environments end in a suffix like "\_v0". When changes are made to environments that might impact learning results, the number is increased by one to prevent potential confusion.
-## Project Maintainers
-Project Manager: [Elliot Tower](https://github.com/elliottower/)
-
-Maintenance for this project is also contributed by the broader Farama team: [farama.org/team](https://farama.org/team).
-
## Citation
To cite this project in publication, please use
From 5ea881c2b4ef3aef382dbda448e226d46f17e1e2 Mon Sep 17 00:00:00 2001
From: Mario Jerez <72274387+mariojerez@users.noreply.github.com>
Date: Sun, 24 Nov 2024 14:46:44 -0600
Subject: [PATCH 19/22] Fixed error in render function for
CustomActionMaskedEnvironment Class (#1239)
---
tutorials/CustomEnvironment/tutorial3_action_masking.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tutorials/CustomEnvironment/tutorial3_action_masking.py b/tutorials/CustomEnvironment/tutorial3_action_masking.py
index 24676373f..c0dfe2170 100644
--- a/tutorials/CustomEnvironment/tutorial3_action_masking.py
+++ b/tutorials/CustomEnvironment/tutorial3_action_masking.py
@@ -193,7 +193,7 @@ def step(self, actions):
def render(self):
"""Renders the environment."""
- grid = np.zeros((7, 7))
+ grid = np.zeros((8, 8), dtype=object)
grid[self.prisoner_y, self.prisoner_x] = "P"
grid[self.guard_y, self.guard_x] = "G"
grid[self.escape_y, self.escape_x] = "E"
From d6bd1107d8350ad14c7d9c9b804ee3ca83c2d74d Mon Sep 17 00:00:00 2001
From: Matthew Sbar
Date: Sun, 24 Nov 2024 14:48:53 -0600
Subject: [PATCH 20/22] Chore: typos in aec.md (#1234)
---
docs/api/aec.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/api/aec.md b/docs/api/aec.md
index 8396c71c9..9248adccc 100644
--- a/docs/api/aec.md
+++ b/docs/api/aec.md
@@ -94,8 +94,8 @@ The [_Agent Environment Cycle_](https://arxiv.org/abs/2009.13051) (AEC) model wa
In an AEC environment, agents act sequentially, receiving updated observations and rewards before taking an action. The environment updates after each agent's step, making it a natural way of representing sequential games such as Chess. The AEC model is flexible enough to handle any type of game that multi-agent RL can consider.
-with the underlying environment updating after each agent's step. Agents receive updated observations and rewards at the beginning of their . The environment is updated after every step,
-This is a natural way of representing sequential games such as Chess, and
+Agents receive updated observations and rewards at the beginning of their turn, with the underlying environment updating after each agent's step.
+This is a natural way of representing sequential games such as Chess and Go.
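+
+A minimal interaction loop (shown here for illustration with `connect_four_v3`; any AEC environment follows the same pattern) looks like this:
+
+```python
+from pettingzoo.classic import connect_four_v3
+
+env = connect_four_v3.env()
+env.reset(seed=42)
+for agent in env.agent_iter():
+    observation, reward, termination, truncation, info = env.last()
+    if termination or truncation:
+        action = None
+    else:
+        # Sample a random legal action using the environment-provided action mask
+        action = env.action_space(agent).sample(observation["action_mask"])
+    env.step(action)
+env.close()
+```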
```{figure} /_static/img/aec_cycle_figure.png
:width: 480px
From c1260f74a3b2eade64b64508fac2c1e4bdadfae4 Mon Sep 17 00:00:00 2001
From: David GERARD
Date: Mon, 2 Dec 2024 16:06:25 +0000
Subject: [PATCH 21/22] [Fix] #1242 bug report broken tests for connect four in
ci (#1243)
---
.github/workflows/linux-tutorials-test.yml | 3 ++-
docs/tutorials/sb3/connect_four.md | 7 +++++++
tutorials/SB3/connect_four/requirements.txt | 1 +
tutorials/SB3/connect_four/sb3_connect_four_action_mask.py | 6 ++++++
4 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/linux-tutorials-test.yml b/.github/workflows/linux-tutorials-test.yml
index f74a9b3c5..d559302eb 100644
--- a/.github/workflows/linux-tutorials-test.yml
+++ b/.github/workflows/linux-tutorials-test.yml
@@ -15,9 +15,10 @@ jobs:
runs-on: ubuntu-latest
strategy:
fail-fast: false
+
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']
- tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/connect_four, SB3/test] # TODO: fix tutorials and add back Ray
+ tutorial: [Tianshou, CustomEnvironment, CleanRL, SB3/kaz, SB3/waterworld, SB3/test] # TODO: fix tutorials and add back Ray, fix SB3/connect_four tutorial
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
diff --git a/docs/tutorials/sb3/connect_four.md b/docs/tutorials/sb3/connect_four.md
index 8b85f8cca..eef34deac 100644
--- a/docs/tutorials/sb3/connect_four.md
+++ b/docs/tutorials/sb3/connect_four.md
@@ -4,6 +4,13 @@ title: "SB3: Action Masked PPO for Connect Four"
# SB3: Action Masked PPO for Connect Four
+```{eval-rst}
+.. warning::
+
+ Currently, this tutorial doesn't work with versions of gymnasium>0.29.1. We are looking into fixing it but it might take some time.
+
+```
+
This tutorial shows how to train agents using Maskable [Proximal Policy Optimization](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html) (PPO) on the [Connect Four](/environments/classic/connect_four/) environment ([AEC](/api/aec/)).
It creates a custom Wrapper to convert to a [Gymnasium](https://gymnasium.farama.org/)-like environment which is compatible with [SB3 action masking](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html).
diff --git a/tutorials/SB3/connect_four/requirements.txt b/tutorials/SB3/connect_four/requirements.txt
index bf7c59673..e8ed650ab 100644
--- a/tutorials/SB3/connect_four/requirements.txt
+++ b/tutorials/SB3/connect_four/requirements.txt
@@ -1,3 +1,4 @@
pettingzoo[classic]>=1.24.0
stable-baselines3>=2.0.0
sb3-contrib>=2.0.0
+gymnasium<=0.29.1
diff --git a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
index 29d623251..e3dc63d34 100644
--- a/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
+++ b/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
@@ -9,6 +9,7 @@
import os
import time
+import gymnasium as gym
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
@@ -174,6 +175,11 @@ def eval_action_mask(env_fn, num_games=100, render_mode=None, **env_kwargs):
if __name__ == "__main__":
+ if gym.__version__ > "0.29.1":
+ raise ImportError(
+ f"This script requires gymnasium version 0.29.1 or lower, but you have version {gym.__version__}."
+ )
+
env_fn = connect_four_v3
env_kwargs = {}
From 6d0a827f74268e201604f18bcfd946225ba14ad7 Mon Sep 17 00:00:00 2001
From: David GERARD
Date: Mon, 2 Dec 2024 16:07:06 +0000
Subject: [PATCH 22/22] [Chore] update maintainer section (#1241)
---
README.md | 3 +++
1 file changed, 3 insertions(+)
diff --git a/README.md b/README.md
index a55f74106..adb30215b 100644
--- a/README.md
+++ b/README.md
@@ -87,3 +87,6 @@ To cite this project in publication, please use
year={2021}
}
```
+## Project Maintainers
+- Project Manager: [David Gerard](https://github.com/David-GERARD) - `david.gerard.23@ucl.ac.uk`.
+- Maintenance for this project is also contributed by the broader Farama team: [farama.org/team](https://farama.org/team).