Extend IG with reward modes, make terms uniform across IG and BPD #57

Merged
merged 8 commits into from
Jun 18, 2024

Changes from 6 commits
26 changes: 15 additions & 11 deletions momaland/envs/beach/beach.py
@@ -5,6 +5,7 @@

import functools
import random
import warnings
from typing_extensions import override

import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.

## Reward Space
The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
- the occupation level
- the mixture level
If the scheme is 'local', the reward is given for the currently occupied section.
If the scheme is 'global', the reward is summed over all sections.
If the mode is 'individual', the reward is given for the currently occupied section.
If the mode is 'team', the reward is summed over all sections.

## Starting State
The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
## Arguments
- 'num_timesteps (int)': number of timesteps in the domain. Default: 1
- 'num_agents (int)': number of agents in the domain. Default: 100
- 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
- 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
- 'sections (int)': number of beach sections in the domain. Default: 6
- 'capacity (int)': capacity of each beach section. Default: 7
- 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
self,
num_timesteps=1,
num_agents=100,
reward_scheme="local",
reward_mode="individual",
sections=6,
capacity=7,
type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
sections: number of beach sections in the domain
capacity: capacity of each beach section
num_agents: number of agents in the domain
reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
num_timesteps: number of timesteps in the domain
render_mode: render mode
reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
"""
EzPickle.__init__(
self,
num_timesteps,
num_agents,
reward_scheme,
reward_mode,
sections,
capacity,
type_distribution,
position_distribution,
render_mode,
)
self.reward_scheme = reward_scheme
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
Collaborator: Why don't we throw an exception in that case?

Collaborator (Author): Changed in the meantime to throw a warning and fall back on the individual setting.

warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
else:
self.reward_mode = reward_mode
self.sections = sections
# TODO Extend to distinct capacities per section?
self.resource_capacities = [capacity for _ in range(sections)]
self.num_timesteps = num_timesteps
self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)

if env_termination:
if self.reward_scheme == "local":
if self.reward_mode == "individual":
for i in range(self.sections):
lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
lr_mixture = _local_mixture_reward(section_agent_types[i])
reward_per_section[i] = np.array([lr_capacity, lr_mixture])

elif self.reward_scheme == "global":
elif self.reward_mode == "team":
g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
g_mixture = _global_mixture_reward(section_agent_types)
reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
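For orientation, a minimal usage sketch of the renamed argument and its fallback behaviour; the import path is inferred from the file header above and the values are illustrative only:

```python
from momaland.envs.beach.beach import MOBeachDomain  # import path inferred from the file header above

# 'team' mode: the capacity and mixture rewards are summed over all sections.
team_env = MOBeachDomain(reward_mode="team")

# An unrecognised value no longer raises: per the review thread above, the
# constructor emits a UserWarning and falls back to per-section 'individual' rewards.
fallback_env = MOBeachDomain(reward_mode="global")
```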
19 changes: 16 additions & 3 deletions momaland/envs/item_gathering/item_gathering.py
@@ -6,18 +6,19 @@

Notes:
- In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
2 for the other agents) or items (3, 4, etc., depending on the number of items).
the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
- The number of agents and items is configurable, by providing an initial map.
- If no initial map is provided, the environment uses a default map

Central observation:
- If the central_observation flag is set to True, then the environment implements:
- If the central_observation flag is set to True, then the environment includes in the implementation:
- a central observation space: self.central_observation_space
- a central observation function: self.state()
The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
"""

import random
import warnings
from copy import deepcopy
from os import path
from typing_extensions import override
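To make the re-encoded observation described in the notes above concrete, a toy central map might look as follows (shape and values are invented purely for illustration):

```python
import numpy as np

# 0 = empty cell, negative integers = agents (-1, -2, ...),
# positive integers = items (exact ids come from the environment's item dictionary).
toy_map = np.array([
    [0, -1,  0],
    [3,  0, -2],
    [0,  4,  0],
])
```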
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
- 'num_timesteps': number of timesteps to run the environment for. Default: 10
- 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
- 'randomise': whether to randomise the map, at each episode. Default: False
- 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
- 'render_mode': render mode for the environment. Default: None
"""

@@ -118,6 +120,7 @@ def __init__(
num_timesteps=10,
initial_map=DEFAULT_MAP,
randomise=False,
reward_mode="individual",
render_mode=None,
):
"""Initializes the item gathering domain.
@@ -126,19 +129,26 @@ def __init__(
num_timesteps: number of timesteps to run the environment for
initial_map: map of the environment
randomise: whether to randomise the map, at each episode
reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
render_mode: render mode for the environment
"""
EzPickle.__init__(
self,
num_timesteps,
initial_map,
randomise,
reward_mode,
render_mode,
)
self.num_timesteps = num_timesteps
self.current_timestep = 0
self.render_mode = render_mode
self.randomise = randomise
if reward_mode not in ["individual", "team"]:
self.reward_mode = "individual"
warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
else:
self.reward_mode = reward_mode

# check if the initial map has any entries equal to 1
assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
@@ -391,6 +401,9 @@ def step(self, actions):
if value_in_cell > 0:
rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
# if reward_mode is 'team', sum the rewards over all agents
if self.reward_mode == "team":
rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}

map_obs = self.state()
observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
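The new 'team' branch above replaces every agent's reward vector with the element-wise sum over all agents; a toy illustration with invented values:

```python
import numpy as np

rewards = {
    "agent_0": np.array([1.0, 0.0, 0.0]),
    "agent_1": np.array([0.0, 2.0, 1.0]),
}

# Mirrors the aggregation in the diff: each agent ends up with the same total vector.
team_total = np.sum(list(rewards.values()), axis=0)  # array([1., 2., 1.])
rewards = {agent: team_total for agent in rewards}
```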
8 changes: 4 additions & 4 deletions momaland/learning/iql/tabular_bpd.py
@@ -65,9 +65,9 @@ def normalize_objective_rewards(self, reward, reward_scheme):
np.array: the normalized reward
"""
# Set the normalization constants
if reward_scheme == "local":
if reward_scheme == "individual":
cap_min, cap_max, mix_min, mix_max = self.l_cap_min, self.l_cap_max, self.l_mix_min, self.l_mix_max
elif reward_scheme == "global":
elif reward_scheme == "team":
cap_min, cap_max, mix_min, mix_max = self.g_cap_min, self.g_cap_max, self.g_mix_min, self.g_mix_max
else:
raise ValueError(f"Unknown reward scheme: {reward_scheme}")
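The scaling step itself is collapsed in this diff view; a plausible sketch of what presumably follows the constant selection, assuming plain min-max normalisation of the two objectives (an assumption, not necessarily the exact implementation):

```python
import numpy as np

def minmax_normalize(reward, cap_min, cap_max, mix_min, mix_max):
    """Scale a [capacity, mixture] reward vector into [0, 1] per objective."""
    capacity, mixture = reward
    return np.array(
        [
            (capacity - cap_min) / (cap_max - cap_min),
            (mixture - mix_min) / (mix_max - mix_min),
        ],
        dtype=np.float32,
    )
```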
@@ -108,15 +108,15 @@ def step(self, actions):
section_agent_types[self._state[i]][self._types[i]] += 1
g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
g_mixture = _global_mixture_reward(section_agent_types)
g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "global")
g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "team")
infos = {
agent: {"g_cap": g_capacity, "g_mix": g_mixture, "g_cap_norm": g_capacity_norm, "g_mix_norm": g_mixture_norm}
for agent in self.possible_agents
}

# Normalize the rewards
for agent in self.possible_agents:
rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_scheme)
rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_mode)

return observations, rewards, terminations, truncations, infos

12 changes: 6 additions & 6 deletions momaland/learning/iql/train_iql_bpd.py
@@ -38,7 +38,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
# Maximum local capacity is achieved when there are 'capacity' agents in the section
max_cap_local = _local_capacity_reward(capacity, capacity)
cap_min = 0.0
cap_max = max_cap_local if reward_scheme == "local" else max_cap_global
cap_max = max_cap_local if reward_scheme == "individual" else max_cap_global

# Mixture
# Maximum global mixture: one agent of each type in each section, except one where all other agents are
@@ -52,7 +52,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
# Maximum local mixture is achieved when there is one agent of each type in the section
max_mix_local = _local_mixture_reward([1, 1])
mix_min = 0.0
mix_max = max_mix_local if reward_scheme == "local" else max_mix_global
mix_max = max_mix_local if reward_scheme == "individual" else max_mix_global

return cap_min, cap_max, mix_min, mix_max

@@ -94,7 +94,7 @@ def parse_args():
parser.add_argument('--position-distribution', type=float, nargs=5, default=[0., 0.5, 0., 0.5, 0.], )
parser.add_argument('--sections', type=int, default=5, )
parser.add_argument('--capacity', type=int, default=3, )
parser.add_argument('--reward-scheme', type=str, default="local", help="the reward scheme to use")
parser.add_argument('--reward-scheme', type=str, default="individual", help="the reward scheme to use")

args = parser.parse_args()
args.time = time.time()
@@ -112,13 +112,13 @@ def parse_args():
"position_distribution": args.position_distribution,
"sections": args.sections,
"capacity": args.capacity,
"reward_scheme": args.reward_scheme,
"reward_mode": args.reward_scheme,
# Normalization constants
"local_constants": compute_normalization_constants(
args.num_agents, args.sections, args.capacity, args.type_distribution, "local"
args.num_agents, args.sections, args.capacity, args.type_distribution, "individual"
),
"global_constants": compute_normalization_constants(
args.num_agents, args.sections, args.capacity, args.type_distribution, "global"
args.num_agents, args.sections, args.capacity, args.type_distribution, "team"
),
}

5 changes: 3 additions & 2 deletions momaland/learning/morl/random_centralised_agent_example.py
@@ -44,6 +44,7 @@ def train_random(moma_env):
num_timesteps=50,
initial_map=test_map,
randomise=True,
reward_mode="test",
rradules marked this conversation as resolved.
render_mode=None,
)

@@ -57,8 +58,8 @@
reward_scheme="local",
rradules marked this conversation as resolved.
)

# train_random(ig_env)
train_random(ig_env)
# train_random(mobpd_env)

# train_sa_random(ig_env)
train_sa_random(mobpd_env)
# train_sa_random(mobpd_env)
58 changes: 0 additions & 58 deletions momaland/learning/morl/train_bpd_GPILS.py

This file was deleted.

47 changes: 0 additions & 47 deletions momaland/learning/morl/train_bpd_PCN.py

This file was deleted.

2 changes: 1 addition & 1 deletion momaland/learning/morl/train_ig_GPILS.py
@@ -43,7 +43,7 @@
gradient_updates=10,
target_net_update_freq=200,
tau=1,
log=True,
log=False, # set this to True to turn on wandb logging
rradules marked this conversation as resolved.
project_name=project_name,
seed=seed,
)
2 changes: 1 addition & 1 deletion momaland/learning/morl/train_ig_PCN.py
@@ -40,7 +40,7 @@
batch_size=256,
project_name=project_name,
experiment_name="PCN",
log=True,
log=False, # set this to True to turn on wandb logging
rradules marked this conversation as resolved.
)
timesteps_per_iter = 10000
agent.train(