Merge pull request #57 from Farama-Foundation/extend-bpd-ig
Extend IG with reward modes, make terms uniform across IG and BPD
rradules authored Jun 18, 2024
2 parents 2be31ad + 70a15fe commit e4966dd
Showing 8 changed files with 48 additions and 135 deletions.
26 changes: 15 additions & 11 deletions momaland/envs/beach/beach.py
@@ -5,6 +5,7 @@

import functools
import random
import warnings
from typing_extensions import override

import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.
## Reward Space
The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
- the occupation level
- the mixture level
If the scheme is 'local', the reward is given for the currently occupied section.
If the scheme is 'global', the reward is summed over all sections.
If the mode is 'individual', the reward is given for the currently occupied section.
If the mode is 'team', the reward is summed over all sections.
## Starting State
The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
## Arguments
- 'num_timesteps (int)': number of timesteps in the domain. Default: 1
- 'num_agents (int)': number of agents in the domain. Default: 100
- 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
- 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
- 'sections (int)': number of beach sections in the domain. Default: 6
- 'capacity (int)': capacity of each beach section. Default: 7
- 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
self,
num_timesteps=1,
num_agents=100,
reward_scheme="local",
reward_mode="individual",
sections=6,
capacity=7,
type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
sections: number of beach sections in the domain
capacity: capacity of each beach section
num_agents: number of agents in the domain
reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
num_timesteps: number of timesteps in the domain
render_mode: render mode
reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
"""
EzPickle.__init__(
self,
num_timesteps,
num_agents,
reward_scheme,
reward_mode,
sections,
capacity,
type_distribution,
position_distribution,
render_mode,
)
self.reward_scheme = reward_scheme
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
else:
self.reward_mode = reward_mode
self.sections = sections
# TODO Extend to distinct capacities per section?
self.resource_capacities = [capacity for _ in range(sections)]
self.num_timesteps = num_timesteps
self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)

if env_termination:
if self.reward_scheme == "local":
if self.reward_mode == "individual":
for i in range(self.sections):
lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
lr_mixture = _local_mixture_reward(section_agent_types[i])
reward_per_section[i] = np.array([lr_capacity, lr_mixture])

elif self.reward_scheme == "global":
elif self.reward_mode == "team":
g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
g_mixture = _global_mixture_reward(section_agent_types)
reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
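For reference, a minimal usage sketch (not part of this commit) of the renamed reward_mode argument in the beach domain; the import path momaland.envs.beach.mobeach_v0 and the standard PettingZoo parallel API are assumed, while the constructor arguments mirror those shown above.

from momaland.envs.beach import mobeach_v0  # assumed import path

env = mobeach_v0.parallel_env(
    num_timesteps=5,
    num_agents=10,
    reward_mode="team",  # "individual": reward of the occupied section; "team": summed over all sections
    sections=3,
    capacity=2,
    type_distribution=(0.7, 0.3),
)
observations, infos = env.reset(seed=42)
while env.agents:
    # sample a random action (0: left, 1: right, 2: stay) for every agent
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, terminations, truncations, infos = env.step(actions)
    # each rewards[agent] is a 2-element vector: [capacity objective, mixture objective]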
19 changes: 16 additions & 3 deletions momaland/envs/item_gathering/item_gathering.py
@@ -6,18 +6,19 @@
Notes:
- In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
2 for the other agents) or items (3, 4, etc., depending on the number of items).
the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
- The number of agents and items is configurable, by providing an initial map.
- If no initial map is provided, the environment uses a default map
Central observation:
- If the central_observation flag is set to True, then the environment implements:
- If the central_observation flag is set to True, then the environment includes in the implementation:
- a central observation space: self.central_observation_space
- a central observation function: self.state()
The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
"""

import random
import warnings
from copy import deepcopy
from os import path
from typing_extensions import override
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
- 'num_timesteps': number of timesteps to run the environment for. Default: 10
- 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
- 'randomise': whether to randomise the map, at each episode. Default: False
- 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
- 'render_mode': render mode for the environment. Default: None
"""

@@ -118,6 +120,7 @@ def __init__(
num_timesteps=10,
initial_map=DEFAULT_MAP,
randomise=False,
reward_mode="individual",
render_mode=None,
):
"""Initializes the item gathering domain.
@@ -126,19 +129,26 @@ def __init__(
num_timesteps: number of timesteps to run the environment for
initial_map: map of the environment
randomise: whether to randomise the map, at each episode
reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
render_mode: render mode for the environment
"""
EzPickle.__init__(
self,
num_timesteps,
initial_map,
randomise,
reward_mode,
render_mode,
)
self.num_timesteps = num_timesteps
self.current_timestep = 0
self.render_mode = render_mode
self.randomise = randomise
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
else:
self.reward_mode = reward_mode

# check if the initial map has any entries equal to 1
assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
@@ -391,6 +401,9 @@ def step(self, actions):
if value_in_cell > 0:
rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
# if reward mode is team, sum the rewards for all agents
if self.reward_mode == "team":
rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}

map_obs = self.state()
observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
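For reference, a small NumPy sketch (not part of this commit) of the two behaviours described above: the 2D map observation that uses 0 for empty cells, negative integers for agents and positive integers for items, and the 'team' aggregation that replaces every agent's reward vector with the element-wise sum over all agents. The map layout and reward values below are hypothetical.

import numpy as np

# Hypothetical 4x4 map observation: 0 = empty, negative integers = agents, positive integers = item types.
example_map_obs = np.array([
    [ 0,  0,  3,  0],
    [-1,  0,  0,  4],
    [ 0, -2,  0,  0],
    [ 0,  0,  3,  0],
])

# Hypothetical per-agent, per-objective reward vectors after item pickup.
rewards = {
    "agent_0": np.array([1.0, 0.0, 0.0], dtype=np.float32),
    "agent_1": np.array([0.0, 2.0, 1.0], dtype=np.float32),
}

# "team" mode, mirroring the added line in step(): every agent receives the
# element-wise sum of all individual reward vectors.
team_sum = np.sum(list(rewards.values()), axis=0)  # array([1., 2., 1.], dtype=float32)
rewards = {agent: team_sum for agent in rewards}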
8 changes: 4 additions & 4 deletions momaland/learning/iql/tabular_bpd.py
@@ -65,9 +65,9 @@ def normalize_objective_rewards(self, reward, reward_scheme):
np.array: the normalized reward
"""
# Set the normalization constants
if reward_scheme == "local":
if reward_scheme == "individual":
cap_min, cap_max, mix_min, mix_max = self.l_cap_min, self.l_cap_max, self.l_mix_min, self.l_mix_max
elif reward_scheme == "global":
elif reward_scheme == "team":
cap_min, cap_max, mix_min, mix_max = self.g_cap_min, self.g_cap_max, self.g_mix_min, self.g_mix_max
else:
raise ValueError(f"Unknown reward scheme: {reward_scheme}")
@@ -108,15 +108,15 @@ def step(self, actions):
section_agent_types[self._state[i]][self._types[i]] += 1
g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
g_mixture = _global_mixture_reward(section_agent_types)
g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "global")
g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "team")
infos = {
agent: {"g_cap": g_capacity, "g_mix": g_mixture, "g_cap_norm": g_capacity_norm, "g_mix_norm": g_mixture_norm}
for agent in self.possible_agents
}

# Normalize the rewards
for agent in self.possible_agents:
rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_scheme)
rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_mode)

return observations, rewards, terminations, truncations, infos

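For reference, a hedged sketch of the per-objective normalisation that normalize_objective_rewards presumably applies with the constants selected above; the min-max form is an assumption, since the function body is not shown in this diff.

import numpy as np

def normalize_objective_rewards_sketch(reward, cap_min, cap_max, mix_min, mix_max):
    """Assumed min-max normalisation of a [capacity, mixture] reward vector."""
    capacity, mixture = reward
    return np.array(
        [
            (capacity - cap_min) / (cap_max - cap_min),
            (mixture - mix_min) / (mix_max - mix_min),
        ],
        dtype=np.float32,
    )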
12 changes: 6 additions & 6 deletions momaland/learning/iql/train_iql_bpd.py
@@ -38,7 +38,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
# Maximum local capacity is achieved when there are 'capacity' agents in the section
max_cap_local = _local_capacity_reward(capacity, capacity)
cap_min = 0.0
cap_max = max_cap_local if reward_scheme == "local" else max_cap_global
cap_max = max_cap_local if reward_scheme == "individual" else max_cap_global

# Mixture
# Maximum global mixture: one agent of each type in each section, except one where all other agents are
@@ -52,7 +52,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
# Maximum local mixture is achieved when there is one agent of each type in the section
max_mix_local = _local_mixture_reward([1, 1])
mix_min = 0.0
mix_max = max_mix_local if reward_scheme == "local" else max_mix_global
mix_max = max_mix_local if reward_scheme == "individual" else max_mix_global

return cap_min, cap_max, mix_min, mix_max

@@ -96,7 +96,7 @@ def parse_args():
parser.add_argument('--position-distribution', type=float, nargs=5, default=[0., 0.5, 0., 0.5, 0.], )
parser.add_argument('--sections', type=int, default=5, )
parser.add_argument('--capacity', type=int, default=3, )
parser.add_argument('--reward-scheme', type=str, default="local", help="the reward scheme to use")
parser.add_argument('--reward-scheme', type=str, default="individual", help="the reward scheme to use")

args = parser.parse_args()
args.time = time.time()
@@ -114,13 +114,13 @@ def parse_args():
"position_distribution": args.position_distribution,
"sections": args.sections,
"capacity": args.capacity,
"reward_scheme": args.reward_scheme,
"reward_mode": args.reward_scheme,
# Normalization constants
"local_constants": compute_normalization_constants(
args.num_agents, args.sections, args.capacity, args.type_distribution, "local"
args.num_agents, args.sections, args.capacity, args.type_distribution, "individual"
),
"global_constants": compute_normalization_constants(
args.num_agents, args.sections, args.capacity, args.type_distribution, "global"
args.num_agents, args.sections, args.capacity, args.type_distribution, "team"
),
}

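For reference, a hypothetical invocation after this change, using only the flags visible in the hunks above (note that the CLI flag keeps the old --reward-scheme name while its accepted values switch to 'individual'/'team'):

python momaland/learning/iql/train_iql_bpd.py --sections 5 --capacity 3 --reward-scheme individual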
9 changes: 5 additions & 4 deletions momaland/learning/morl/random_centralised_agent_example.py
@@ -44,6 +44,7 @@ def train_random(moma_env):
num_timesteps=50,
initial_map=test_map,
randomise=True,
reward_mode="individual",
render_mode=None,
)

@@ -54,11 +55,11 @@ def train_random(moma_env):
type_distribution=[0.5, 0.5],
position_distribution=[0.5, 1],
num_timesteps=10,
reward_scheme="local",
reward_mode="individual",
)

# train_random(ig_env)
# train_random(mobpd_env)
train_random(ig_env)
train_random(mobpd_env)

# train_sa_random(ig_env)
train_sa_random(mobpd_env)
# train_sa_random(mobpd_env)
4 changes: 2 additions & 2 deletions momaland/learning/morl/sa_env_factory.py
@@ -34,7 +34,7 @@ def make_single_agent_bpd_env(size="small"):
bpd_env = mobeach_v0.parallel_env(
num_timesteps=5,
num_agents=10,
reward_scheme="global",
reward_mode="team",
sections=3,
capacity=2,
type_distribution=(0.7, 0.3),
@@ -44,7 +44,7 @@
bpd_env = moitem_gathering_v0.parallel_env(
num_timesteps=1,
num_agents=50,
reward_scheme="global",
reward_mode="team",
sections=5,
capacity=3,
type_distribution=(0.7, 0.3),
58 changes: 0 additions & 58 deletions momaland/learning/morl/train_bpd_GPILS.py

This file was deleted.

47 changes: 0 additions & 47 deletions momaland/learning/morl/train_bpd_PCN.py

This file was deleted.
