diff --git a/momaland/envs/beach/beach.py b/momaland/envs/beach/beach.py
index 6ab2cfcc..4725d678 100644
--- a/momaland/envs/beach/beach.py
+++ b/momaland/envs/beach/beach.py
@@ -5,6 +5,7 @@
 import functools
 import random
+import warnings
 
 from typing_extensions import override
 
 import numpy as np
@@ -69,11 +70,11 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     The action space is a Discrete space [0, 1, 2], corresponding to moving left, moving right, staying in place.
 
     ## Reward Space
-    The reward space is a 2D vector containing rewards for two different schemes ('local' or 'global') for:
+    The reward space is a 2D vector containing rewards for two different modes ('individual' or 'team') for:
     - the occupation level
     - the mixture level
-    If the scheme is 'local', the reward is given for the currently occupied section.
-    If the scheme is 'global', the reward is summed over all sections.
+    If the mode is 'individual', the reward is given for the currently occupied section.
+    If the mode is 'team', the reward is summed over all sections.
 
     ## Starting State
     The initial position is a uniform random distribution of agents over the sections. This can be changed via the
@@ -90,7 +91,7 @@ class MOBeachDomain(MOParallelEnv, EzPickle):
     ## Arguments
     - 'num_timesteps (int)': number of timesteps in the domain. Default: 1
    - 'num_agents (int)': number of agents in the domain. Default: 100
-    - 'reward_scheme (str)': the reward scheme to use ('local', or 'global'). Default: local
+    - 'reward_mode (str)': the reward mode to use ('individual', or 'team'). Default: individual
     - 'sections (int)': number of beach sections in the domain. Default: 6
     - 'capacity (int)': capacity of each beach section. Default: 7
     - 'type_distribution (tuple)': the distribution of agent types in the domain. Default: 2 types equally distributed (0.3, 0.7).
@@ -104,7 +105,7 @@ def __init__(
         self,
         num_timesteps=1,
         num_agents=100,
-        reward_scheme="local",
+        reward_mode="individual",
         sections=6,
         capacity=7,
         type_distribution=(0.3, 0.7),
@@ -117,26 +118,29 @@ def __init__(
             sections: number of beach sections in the domain
             capacity: capacity of each beach section
             num_agents: number of agents in the domain
+            reward_mode: the reward mode to use ('individual', or 'team'). Default: individual
             type_distribution: the distribution of agent types in the domain. Default: 2 types equally distributed.
             position_distribution: the initial distribution of agents in the domain. Default: uniform over all sections.
             num_timesteps: number of timesteps in the domain
             render_mode: render mode
-            reward_scheme: the reward scheme to use ('local', or 'global'). Default: local
         """
         EzPickle.__init__(
             self,
             num_timesteps,
             num_agents,
-            reward_scheme,
+            reward_mode,
             sections,
             capacity,
             type_distribution,
             position_distribution,
             render_mode,
         )
-        self.reward_scheme = reward_scheme
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("Invalid reward_mode. Must be either 'individual' or 'team'. Defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode
         self.sections = sections
-        # TODO Extend to distinct capacities per section?
         self.resource_capacities = [capacity for _ in range(sections)]
         self.num_timesteps = num_timesteps
         self.episode_num = 0
@@ -296,13 +300,13 @@ def step(self, actions):
         reward_per_section = np.zeros((self.sections, NUM_OBJECTIVES), dtype=np.float32)
 
         if env_termination:
-            if self.reward_scheme == "local":
+            if self.reward_mode == "individual":
                 for i in range(self.sections):
                     lr_capacity = _local_capacity_reward(self.resource_capacities[i], section_consumptions[i])
                     lr_mixture = _local_mixture_reward(section_agent_types[i])
                     reward_per_section[i] = np.array([lr_capacity, lr_mixture])
 
-            elif self.reward_scheme == "global":
+            elif self.reward_mode == "team":
                 g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
                 g_mixture = _global_mixture_reward(section_agent_types)
                 reward_per_section = np.array([[g_capacity, g_mixture]] * self.sections)
diff --git a/momaland/envs/item_gathering/item_gathering.py b/momaland/envs/item_gathering/item_gathering.py
index 77aa8ec6..4eff029a 100644
--- a/momaland/envs/item_gathering/item_gathering.py
+++ b/momaland/envs/item_gathering/item_gathering.py
@@ -6,18 +6,19 @@
 
 Notes:
     - In contrast to the original environment, the observation space is a 2D array of integers, i.e.,
-     the map of the environment, where each integer represents either agents (1 for the agent receiving the observation,
-     2 for the other agents) or items (3, 4, etc., depending on the number of items).
+     the map of the environment, with 0 for empty cells, negative integers for agents, positive integers for items.
     - The number of agents and items is configurable, by providing an initial map.
     - If no initial map is provided, the environment uses a default map
 
 Central observation:
-    - If the central_observation flag is set to True, then the environment implements:
+    - If the central_observation flag is set to True, then the environment includes in the implementation:
         - a central observation space: self.central_observation_space
         - a central observation function: self.state()
+    The central_observation flag and the associated methods described above are used by the CentralisedAgent wrapper
 """
 
 import random
+import warnings
 from copy import deepcopy
 from os import path
 from typing_extensions import override
@@ -102,6 +103,7 @@ class MOItemGathering(MOParallelEnv, EzPickle):
     - 'num_timesteps': number of timesteps to run the environment for. Default: 10
     - 'initial_map': map of the environment. Default: 8x8 grid, 2 agents, 3 objectives (Källström and Heintz, 2019)
     - 'randomise': whether to randomise the map, at each episode. Default: False
+    - 'reward_mode': reward mode for the environment ('individual' or 'team'). Default: 'individual'
     - 'render_mode': render mode for the environment. Default: None
     """
 
@@ -118,6 +120,7 @@ def __init__(
         num_timesteps=10,
         initial_map=DEFAULT_MAP,
         randomise=False,
+        reward_mode="individual",
         render_mode=None,
     ):
         """Initializes the item gathering domain.
@@ -126,6 +129,7 @@ def __init__(
             num_timesteps: number of timesteps to run the environment for
             initial_map: map of the environment
             randomise: whether to randomise the map, at each episode
+            reward_mode: reward mode for the environment, 'individual' or 'team'. Default: 'individual'
             render_mode: render mode for the environment
         """
         EzPickle.__init__(
@@ -133,12 +137,18 @@
             num_timesteps,
             initial_map,
             randomise,
+            reward_mode,
             render_mode,
         )
         self.num_timesteps = num_timesteps
         self.current_timestep = 0
         self.render_mode = render_mode
         self.randomise = randomise
+        if reward_mode not in ["individual", "team"]:
+            self.reward_mode = "individual"
+            warnings.warn("reward_mode must be either 'individual' or 'team', defaulting to 'individual'.")
+        else:
+            self.reward_mode = reward_mode
 
         # check if the initial map has any entries equal to 1
         assert len(np.argwhere(initial_map == 1).flatten()) > 0, "The initial map does not contain any agents (1s)."
@@ -391,6 +401,9 @@ def step(self, actions):
                 if value_in_cell > 0:
                     rewards[self.agents[i]][self.item_dict[value_in_cell]] += 1
                     self.env_map[self.agent_positions[i][0], self.agent_positions[i][1]] = 0
+        # if reward mode is 'team', sum the rewards for all agents
+        if self.reward_mode == "team":
+            rewards = {agent: np.sum(list(rewards.values()), axis=0) for agent in self.agents}
 
         map_obs = self.state()
         observations = {agent: (-(i + 1), map_obs) for i, agent in enumerate(self.agents)}
diff --git a/momaland/learning/iql/tabular_bpd.py b/momaland/learning/iql/tabular_bpd.py
index 970c5265..d50a6f8e 100644
--- a/momaland/learning/iql/tabular_bpd.py
+++ b/momaland/learning/iql/tabular_bpd.py
@@ -65,9 +65,9 @@ def normalize_objective_rewards(self, reward, reward_scheme):
             np.array: the normalized reward
         """
         # Set the normalization constants
-        if reward_scheme == "local":
+        if reward_scheme == "individual":
             cap_min, cap_max, mix_min, mix_max = self.l_cap_min, self.l_cap_max, self.l_mix_min, self.l_mix_max
-        elif reward_scheme == "global":
+        elif reward_scheme == "team":
             cap_min, cap_max, mix_min, mix_max = self.g_cap_min, self.g_cap_max, self.g_mix_min, self.g_mix_max
         else:
             raise ValueError(f"Unknown reward scheme: {reward_scheme}")
@@ -108,7 +108,7 @@ def step(self, actions):
             section_agent_types[self._state[i]][self._types[i]] += 1
         g_capacity = _global_capacity_reward(self.resource_capacities, section_consumptions)
         g_mixture = _global_mixture_reward(section_agent_types)
-        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "global")
+        g_capacity_norm, g_mixture_norm = self.normalize_objective_rewards(np.array([g_capacity, g_mixture]), "team")
         infos = {
             agent: {"g_cap": g_capacity, "g_mix": g_mixture, "g_cap_norm": g_capacity_norm, "g_mix_norm": g_mixture_norm}
             for agent in self.possible_agents
@@ -116,7 +116,7 @@ def step(self, actions):
 
         # Normalize the rewards
         for agent in self.possible_agents:
-            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_scheme)
+            rewards[agent] = self.normalize_objective_rewards(rewards[agent], self.reward_mode)
 
         return observations, rewards, terminations, truncations, infos
 
diff --git a/momaland/learning/iql/train_iql_bpd.py b/momaland/learning/iql/train_iql_bpd.py
index 2f1824be..6fa5c722 100644
--- a/momaland/learning/iql/train_iql_bpd.py
+++ b/momaland/learning/iql/train_iql_bpd.py
@@ -38,7 +38,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local capacity is achieved when there are 'capacity' agents in the section
     max_cap_local = _local_capacity_reward(capacity, capacity)
     cap_min = 0.0
-    cap_max = max_cap_local if reward_scheme == "local" else max_cap_global
+    cap_max = max_cap_local if reward_scheme == "individual" else max_cap_global
 
     # Mixture
     # Maximum global mixture: one agent of each type in each section, except one where all other agents are
@@ -52,7 +52,7 @@ def compute_normalization_constants(num_agents, sections, capacity, type_distrib
     # Maximum local mixture is achieved when there is one agent of each type in the section
     max_mix_local = _local_mixture_reward([1, 1])
     mix_min = 0.0
-    mix_max = max_mix_local if reward_scheme == "local" else max_mix_global
+    mix_max = max_mix_local if reward_scheme == "individual" else max_mix_global
 
     return cap_min, cap_max, mix_min, mix_max
 
@@ -96,7 +96,7 @@ def parse_args():
     parser.add_argument('--position-distribution', type=float, nargs=5, default=[0., 0.5, 0., 0.5, 0.], )
     parser.add_argument('--sections', type=int, default=5, )
     parser.add_argument('--capacity', type=int, default=3, )
-    parser.add_argument('--reward-scheme', type=str, default="local", help="the reward scheme to use")
+    parser.add_argument('--reward-scheme', type=str, default="individual", help="the reward scheme to use")
 
     args = parser.parse_args()
     args.time = time.time()
@@ -114,13 +114,13 @@ def parse_args():
         "position_distribution": args.position_distribution,
         "sections": args.sections,
         "capacity": args.capacity,
-        "reward_scheme": args.reward_scheme,
+        "reward_mode": args.reward_scheme,
         # Normalization constants
         "local_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "local"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "individual"
         ),
         "global_constants": compute_normalization_constants(
-            args.num_agents, args.sections, args.capacity, args.type_distribution, "global"
+            args.num_agents, args.sections, args.capacity, args.type_distribution, "team"
         ),
     }
 
diff --git a/momaland/learning/morl/random_centralised_agent_example.py b/momaland/learning/morl/random_centralised_agent_example.py
index b14777d7..154c9545 100644
--- a/momaland/learning/morl/random_centralised_agent_example.py
+++ b/momaland/learning/morl/random_centralised_agent_example.py
@@ -44,6 +44,7 @@ def train_random(moma_env):
         num_timesteps=50,
         initial_map=test_map,
         randomise=True,
+        reward_mode="individual",
         render_mode=None,
     )
 
@@ -54,11 +55,11 @@ def train_random(moma_env):
         type_distribution=[0.5, 0.5],
         position_distribution=[0.5, 1],
         num_timesteps=10,
-        reward_scheme="local",
+        reward_mode="individual",
     )
 
-    # train_random(ig_env)
-    # train_random(mobpd_env)
+    train_random(ig_env)
+    train_random(mobpd_env)
 
     # train_sa_random(ig_env)
-    train_sa_random(mobpd_env)
+    # train_sa_random(mobpd_env)
diff --git a/momaland/learning/morl/sa_env_factory.py b/momaland/learning/morl/sa_env_factory.py
index 6fa48f11..74e5a3b2 100644
--- a/momaland/learning/morl/sa_env_factory.py
+++ b/momaland/learning/morl/sa_env_factory.py
@@ -34,7 +34,7 @@ def make_single_agent_bpd_env(size="small"):
         bpd_env = mobeach_v0.parallel_env(
             num_timesteps=5,
             num_agents=10,
-            reward_scheme="global",
+            reward_mode="team",
             sections=3,
             capacity=2,
             type_distribution=(0.7, 0.3),
@@ -44,7 +44,7 @@
         bpd_env = moitem_gathering_v0.parallel_env(
             num_timesteps=1,
             num_agents=50,
-            reward_scheme="global",
+            reward_mode="team",
             sections=5,
             capacity=3,
             type_distribution=(0.7, 0.3),
diff --git a/momaland/learning/morl/train_bpd_GPILS.py b/momaland/learning/morl/train_bpd_GPILS.py
deleted file mode 100644
index acfb7cec..00000000
--- a/momaland/learning/morl/train_bpd_GPILS.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""MO Gymnasium on centralised agents versions of MOMAland."""
-
-import argparse
-
-import numpy as np
-from morl_baselines.multi_policy.gpi_pd.gpi_pd import GPILS
-
-from momaland.learning.morl.sa_env_factory import make_single_agent_bpd_env
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-seed", type=int, default=42, help="Seed for the agent.")
-    parser.add_argument("-project", type=str, default="GPI-BPD", help="Project name.")
-    args = parser.parse_args()
-    seed = args.seed
-    obj = 2
-
-    env = make_single_agent_bpd_env(size="small")
-    eval_env = make_single_agent_bpd_env(size="small")
-    project_name = args.project
-
-    ref_point = np.zeros(obj)
-
-    agent = GPILS(
-        env,
-        max_grad_norm=None,
-        learning_rate=3e-4,
-        gamma=0.99,
-        batch_size=256,
-        net_arch=[256, 256],
-        buffer_size=int(2e5),
-        initial_epsilon=1.0,
-        final_epsilon=0.05,
-        epsilon_decay_steps=5000,
-        learning_starts=50,
-        alpha_per=0.6,
-        min_priority=0.01,
-        per=False,
-        use_gpi=True,
-        gradient_updates=10,
-        target_net_update_freq=150,
-        tau=1,
-        log=False,
-        project_name=project_name,
-        seed=seed,
-    )
-
-    timesteps_per_iter = 1000
-    algo = "gpi-ls"
-
-    agent.train(
-        total_timesteps=10 * timesteps_per_iter,
-        eval_env=eval_env,
-        ref_point=ref_point,
-        weight_selection_algo=algo,
-        timesteps_per_iter=timesteps_per_iter,
-    )
diff --git a/momaland/learning/morl/train_bpd_PCN.py b/momaland/learning/morl/train_bpd_PCN.py
deleted file mode 100644
index e3d9b676..00000000
--- a/momaland/learning/morl/train_bpd_PCN.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""MO Gymnasium on centralised agents versions of MOMAland."""
-
-import argparse
-
-import numpy as np
-from morl_baselines.multi_policy.pcn.pcn import PCN
-
-from momaland.learning.morl.sa_env_factory import make_single_agent_bpd_env
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-seed", type=int, default=42, help="Seed for the agent.")
-    parser.add_argument("-project", type=str, default="PCN-BPD", help="Project name.")
-    args = parser.parse_args()
-    seed = args.seed
-    obj = 2
-
-    env = make_single_agent_bpd_env(size="small")
-    eval_env = make_single_agent_bpd_env(size="small")
-    project_name = args.project
-
-    ref_point = np.zeros(obj)
-    max_return = np.array([3.5, 1.5])
-    print("Reference point: ", ref_point)
-
-    agent = PCN(
-        env,
-        seed=seed,
-        gamma=1,
-        scaling_factor=np.ones(obj + 1),
-        learning_rate=1e-3,
-        hidden_dim=512,
-        batch_size=256,
-        project_name=project_name,
-        experiment_name="PCN",
-        log=True,
-    )
-    timesteps_per_iter = 1000
-    agent.train(
-        eval_env=eval_env,
-        total_timesteps=10 * timesteps_per_iter,
-        ref_point=ref_point,
-        num_er_episodes=20,
-        num_model_updates=50,
-        max_return=max_return,
-    )
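
Usage sketch (not part of the patch): the snippet below illustrates the renamed reward_mode argument, mirroring the call sites visible above in momaland/learning/morl/sa_env_factory.py and random_centralised_agent_example.py. The import paths are assumptions, since no hunk in this diff shows the corresponding import lines.

    # Hedged usage sketch; the import paths are assumed, not taken from this diff.
    from momaland.envs.beach import mobeach_v0
    from momaland.envs.item_gathering import moitem_gathering_v0

    # Beach domain: 'team' sums the capacity and mixture rewards over all sections,
    # while 'individual' only rewards the section an agent currently occupies.
    beach_env = mobeach_v0.parallel_env(
        num_timesteps=5,
        num_agents=10,
        reward_mode="team",
        sections=3,
        capacity=2,
        type_distribution=(0.7, 0.3),
    )

    # Item gathering: with reward_mode="team", the per-agent reward vectors are
    # replaced by their sum over all agents at each step.
    ig_env = moitem_gathering_v0.parallel_env(
        num_timesteps=50,
        randomise=True,
        reward_mode="team",
    )

    # Passing an unrecognised value (e.g. the old "global") now emits a warning
    # and falls back to "individual" instead of raising.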