From 9c8eb1b989b8dec66c9ac314f8507e8c931145e7 Mon Sep 17 00:00:00 2001 From: umutucak Date: Mon, 23 Oct 2023 13:56:26 +0200 Subject: [PATCH] escort & catch implemented + reward space fixed --- .../envs/crazyrl/catch/__init__.py | 2 + momadm_benchmarks/envs/crazyrl/catch/catch.py | 305 ++++++++++++++++++ .../envs/crazyrl/catch/catch_v0.py | 5 + .../envs/crazyrl/escort/__init__.py | 2 + .../envs/crazyrl/escort/escort.py | 302 +++++++++++++++++ .../envs/crazyrl/escort/escort_v0.py | 5 + .../envs/crazyrl/surround/__init__.py | 2 + .../envs/crazyrl/surround/surround.py | 18 +- 8 files changed, 626 insertions(+), 15 deletions(-) create mode 100644 momadm_benchmarks/envs/crazyrl/catch/__init__.py create mode 100644 momadm_benchmarks/envs/crazyrl/catch/catch.py create mode 100644 momadm_benchmarks/envs/crazyrl/catch/catch_v0.py create mode 100644 momadm_benchmarks/envs/crazyrl/escort/__init__.py create mode 100644 momadm_benchmarks/envs/crazyrl/escort/escort.py create mode 100644 momadm_benchmarks/envs/crazyrl/escort/escort_v0.py create mode 100644 momadm_benchmarks/envs/crazyrl/surround/__init__.py diff --git a/momadm_benchmarks/envs/crazyrl/catch/__init__.py b/momadm_benchmarks/envs/crazyrl/catch/__init__.py new file mode 100644 index 00000000..32b55e41 --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/catch/__init__.py @@ -0,0 +1,2 @@ +"""Catch environment for multi-agent reinforcement learning.""" +from momadm_benchmarks.envs.crazyrl.catch import catch_v0 diff --git a/momadm_benchmarks/envs/crazyrl/catch/catch.py b/momadm_benchmarks/envs/crazyrl/catch/catch.py new file mode 100644 index 00000000..3f23e12d --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/catch/catch.py @@ -0,0 +1,305 @@ +"""Catch environment for Crazyflie 2. Each agent is supposed to learn to surround a common target point trying to escape.""" + +import time +from typing import Optional +from typing_extensions import override + +import numpy as np +from gymnasium import spaces + +from momadm_benchmarks.envs.crazyrl.crazyRL_base import ( + CLOSENESS_THRESHOLD, + MOBaseParallelEnv, + _distance_to_target, +) +from momadm_benchmarks.utils.conversions import mo_parallel_to_aec + + +def env(*args, **kwargs): + """Returns the wrapped environment in `AEC` format. + + Args: + **kwargs: keyword args to forward to the raw_env function. + + Returns: + A fully wrapped AEC env. + """ + env = raw_env(*args, **kwargs) + env = mo_parallel_to_aec(env) + return env + + +def parallel_env(*args, **kwargs): + """Returns the wrapped env in `parallel` format. + + Args: + **kwargs: keyword args to forward to the raw_env function. + + Returns: + A fully wrapped parallel env. + """ + env = raw_env(*args, **kwargs) + return env + + +def raw_env(*args, **kwargs): + """Returns the environment in `Parallel` format. + + Args: + **kwargs: keyword args to forward to create the `MOMultiwalker` environment. + + Returns: + A raw env. + """ + return Catch(*args, **kwargs) + + +class Catch(MOBaseParallelEnv): + """A Parallel Environment where drone learn how to surround a moving target trying to escape.""" + + metadata = {"render_modes": ["human"], "is_parallelizable": True, "render_fps": 20} + + def __init__( + self, + drone_ids: np.ndarray, + init_flying_pos: np.ndarray, + init_target_location: np.ndarray, + target_speed: float, + target_id: Optional[int] = None, + render_mode=None, + size: int = 2, + multi_obj: bool = True, + ): + """Catch environment for Crazyflies 2. 
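+        Each drone receives two reward components: its progress toward the moving
+        target and its mean distance to the other drones (both replaced by -10 on a
+        collision, see _compute_reward); with multi_obj=False they are scalarized
+        with weights 0.9995 and 0.0005.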
+ + Args: + drone_ids: Array of drone ids + init_flying_pos: Array of initial positions of the drones when they are flying + init_target_location: Array of the initial position of the moving target + target_speed: Distance traveled by the target at each timestep + target_id: Target id if you want a real target + render_mode: Render mode: "human", "real" or None + size: Size of the map + multi_obj: Whether to return a multi-objective reward + """ + self.num_drones = len(drone_ids) + + self._agent_location = dict() + + self._target_location = {"unique": init_target_location} # unique target location for all agents + + self.target_speed = target_speed + + self._init_flying_pos = dict() + self._agents_names = np.array(["agent_" + str(i) for i in drone_ids]) + self.timestep = 0 + + for i, agent in enumerate(self._agents_names): + self._init_flying_pos[agent] = init_flying_pos[i].copy() + + self._agent_location = self._init_flying_pos.copy() + self.multi_obj = multi_obj + self.size = size + + super().__init__( + render_mode=render_mode, + size=size, + init_flying_pos=self._init_flying_pos, + target_location=self._target_location, + agents_names=self._agents_names, + drone_ids=drone_ids, + target_id=target_id, + ) + + @override + def _observation_space(self, agent): + return spaces.Box( + low=np.tile(np.array([-self.size, -self.size, 0], dtype=np.float32), self.num_drones + 1), + high=np.tile(np.array([self.size, self.size, 3], dtype=np.float32), self.num_drones + 1), + shape=(3 * (self.num_drones + 1),), # coordinates of the drones and the target + dtype=np.float32, + ) + + @override + def _action_space(self, agent): + return spaces.Box(low=-1 * np.ones(3, dtype=np.float32), high=np.ones(3, dtype=np.float32), dtype=np.float32) + + @override + def _compute_obs(self): + obs = dict() + + for agent in self._agents_names: + obs[agent] = self._agent_location[agent].copy() + obs[agent] = np.append(obs[agent], self._target_location["unique"]) + + for other_agent in self._agents_names: + if other_agent != agent: + obs[agent] = np.append(obs[agent], self._agent_location[other_agent]) + + return obs + + def _move_target(self): + # mean of the agent's positions + mean = np.array([0, 0, 0]) + for agent in self.agents: + mean = mean + self._agent_location[agent] + + mean = mean / self.num_drones + + dist = np.linalg.norm(mean - self._target_location["unique"]) + self._target_location["unique"] = self._target_location["unique"].copy() + + # go to the opposite direction of the mean of the agents + if dist > 0.2: + self._target_location["unique"] += (self._target_location["unique"] - mean) / dist * self.target_speed + + # if the mean of the agents is too close to the target, move the target in a random direction, slowly because + # it hesitates + else: + self._target_location["unique"] += np.random.random_sample(3) * self.target_speed * 0.1 + + # if the target is out of the map, put it back in the map + np.clip( + self._target_location["unique"], + [-self.size, -self.size, 0.2], + [self.size, self.size, 3], + out=self._target_location["unique"], + ) + + @override + def _transition_state(self, actions): + target_point_action = dict() + state = self._agent_location + + # new targets + self._previous_target = self._target_location.copy() + self._move_target() + + for agent in self.agents: + # Actions are clipped to stay in the map and scaled to do max 20cm in one step + target_point_action[agent] = np.clip( + state[agent] + actions[agent] * 0.2, [-self.size, -self.size, 0], [self.size, self.size, 3] + ) + + return 
target_point_action + + @override + def _compute_reward(self): + # Reward is the mean distance to the other agents minus the distance to the target + reward = dict() + + for agent in self._agents_names: + reward_far_from_other_agents = 0 + reward_close_to_target = 0 + + # mean distance to the other agents + for other_agent in self._agents_names: + if other_agent != agent: + reward_far_from_other_agents += np.linalg.norm( + self._agent_location[agent] - self._agent_location[other_agent] + ) + + reward_far_from_other_agents /= self.num_drones - 1 + + # distance to the target + # (!) targets and locations must be updated before this + dist_from_old_target = _distance_to_target(self._agent_location[agent], self._previous_target["unique"]) + old_dist = _distance_to_target(self._previous_location[agent], self._previous_target["unique"]) + + # reward should be new_potential - old_potential but since the distances should be negated we reversed the signs + # -new_potential - (-old_potential) = old_potential - new_potential + reward_close_to_target = old_dist - dist_from_old_target + + # collision between two drones + for other_agent in self._agents_names: + if other_agent != agent and ( + np.linalg.norm(self._agent_location[agent] - self._agent_location[other_agent]) < CLOSENESS_THRESHOLD + ): + reward_far_from_other_agents = -10 + reward_close_to_target = -10 + + # collision with the ground or the target + if ( + self._agent_location[agent][2] < CLOSENESS_THRESHOLD + or np.linalg.norm(self._agent_location[agent] - self._target_location["unique"]) < CLOSENESS_THRESHOLD + ): + reward_far_from_other_agents = -10 + reward_close_to_target = -10 + + if self.multi_obj: + reward[agent] = np.array([reward_close_to_target, reward_far_from_other_agents]) + else: + # MO reward linearly combined using hardcoded weights + reward[agent] = 0.9995 * reward_close_to_target + 0.0005 * reward_far_from_other_agents + + return reward + + @override + def _compute_terminated(self): + terminated = dict() + + for agent in self.agents: + terminated[agent] = False + + for agent in self.agents: + # collision between two drones + for other_agent in self.agents: + if other_agent != agent: + terminated[agent] = terminated[agent] or ( + np.linalg.norm(self._agent_location[agent] - self._agent_location[other_agent]) < CLOSENESS_THRESHOLD + ) + + # collision with the ground + terminated[agent] = terminated[agent] or (self._agent_location[agent][2] < CLOSENESS_THRESHOLD) + + # collision with the target + terminated[agent] = terminated[agent] or ( + np.linalg.norm(self._agent_location[agent] - self._target_location["unique"]) < CLOSENESS_THRESHOLD + ) + + if terminated[agent] and self.render_mode == "human": + for other_agent in self.agents: + terminated[other_agent] = True + self.agents = [] + + return terminated + + @override + def _compute_truncation(self): + if self.timestep == 200: + truncation = {agent: True for agent in self._agents_names} + self.agents = [] + self.timestep = 0 + else: + truncation = {agent: False for agent in self._agents_names} + return truncation + + @override + def _compute_info(self): + info = dict() + return info + + @override + def state(self): + return np.append(np.array(list(self._agent_location.values())).flatten(), self._target_location["unique"]) + + +if __name__ == "__main__": + prll_env = Catch( + drone_ids=np.array([0, 1, 2, 3]), + render_mode="human", + init_flying_pos=np.array([[0, 0, 1], [1, 1, 1], [0, 1, 1], [2, 2, 1]]), + init_target_location=np.array([1, 1, 2.5]), + target_speed=0.1, + ) + 
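+    # A minimal sketch of the alternative constructors defined above (construction
+    # only, not executed here): parallel_env(...) returns this same parallel API,
+    # while env(...) wraps it with mo_parallel_to_aec into the AEC interface.
+    # With the default multi_obj=True, each entry of `rewards` below is a
+    # 2-element array: [reward_close_to_target, reward_far_from_other_agents].
+    #
+    # aec_env = env(
+    #     drone_ids=np.array([0, 1, 2, 3]),
+    #     init_flying_pos=np.array([[0, 0, 1], [1, 1, 1], [0, 1, 1], [2, 2, 1]]),
+    #     init_target_location=np.array([1, 1, 2.5]),
+    #     target_speed=0.1,
+    # )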
+ observations, infos = prll_env.reset() + + while prll_env.agents: + actions = { + agent: prll_env.action_space(agent).sample() for agent in prll_env.agents + } # this is where you would insert your policy + observations, rewards, terminations, truncations, infos = prll_env.step(actions) + prll_env.render() + print("obs", observations, "reward", rewards) + time.sleep(0.02) diff --git a/momadm_benchmarks/envs/crazyrl/catch/catch_v0.py b/momadm_benchmarks/envs/crazyrl/catch/catch_v0.py new file mode 100644 index 00000000..7de8b41e --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/catch/catch_v0.py @@ -0,0 +1,5 @@ +"""CrazyRL/Catch environment for MOMARL.""" +from momadm_benchmarks.envs.crazyrl.catch.catch import env, parallel_env, raw_env + + +__all__ = ["env", "parallel_env", "raw_env"] diff --git a/momadm_benchmarks/envs/crazyrl/escort/__init__.py b/momadm_benchmarks/envs/crazyrl/escort/__init__.py new file mode 100644 index 00000000..6ed70de7 --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/escort/__init__.py @@ -0,0 +1,2 @@ +"""Escort environment for multi-agent reinforcement learning.""" +from momadm_benchmarks.envs.crazyrl.escort import escort_v0 diff --git a/momadm_benchmarks/envs/crazyrl/escort/escort.py b/momadm_benchmarks/envs/crazyrl/escort/escort.py new file mode 100644 index 00000000..134bbaa4 --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/escort/escort.py @@ -0,0 +1,302 @@ +"""Escort environment for Crazyflie 2. Each agent is supposed to learn to surround a common target point moving to one point to another.""" + +import time +from typing import Optional +from typing_extensions import override + +import numpy as np +import numpy.typing as npt +from gymnasium import spaces + +from momadm_benchmarks.envs.crazyrl.crazyRL_base import ( + CLOSENESS_THRESHOLD, + MOBaseParallelEnv, + _distance_to_target, +) +from momadm_benchmarks.utils.conversions import mo_parallel_to_aec + + +def env(*args, **kwargs): + """Returns the wrapped environment in `AEC` format. + + Args: + **kwargs: keyword args to forward to the raw_env function. + + Returns: + A fully wrapped AEC env. + """ + env = raw_env(*args, **kwargs) + env = mo_parallel_to_aec(env) + return env + + +def parallel_env(*args, **kwargs): + """Returns the wrapped env in `parallel` format. + + Args: + **kwargs: keyword args to forward to the raw_env function. + + Returns: + A fully wrapped parallel env. + """ + env = raw_env(*args, **kwargs) + return env + + +def raw_env(*args, **kwargs): + """Returns the environment in `Parallel` format. + + Args: + **kwargs: keyword args to forward to create the `MOMultiwalker` environment. + + Returns: + A raw env. + """ + return Escort(*args, **kwargs) + + +class Escort(MOBaseParallelEnv): + """A Parallel Environment where drone learn how to surround a moving target, going straight to one point to another.""" + + metadata = {"render_modes": ["human"], "is_parallelizable": True, "render_fps": 20} + + def __init__( + self, + drone_ids: npt.NDArray[np.int32], + init_flying_pos: npt.NDArray[np.int32], + init_target_location: npt.NDArray[np.int32], + final_target_location: npt.NDArray[np.int32], + target_id: Optional[int] = None, + num_intermediate_points: int = 50, + render_mode=None, + size: int = 2, + multi_obj: bool = True, + ): + """Escort environment for Crazyflies 2. 
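+        The target moves along a straight line from init_target_location toward
+        final_target_location, advancing one reference point per timestep over
+        num_intermediate_points + 2 evenly spaced points, then holds its last
+        position. The reward components mirror those of Catch: progress toward
+        the target and mean distance to the other drones.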
+ + Args: + drone_ids: Array of drone ids + init_flying_pos: Array of initial positions of the drones when they are flying + init_target_location: Array of the initial position of the moving target + final_target_location: Array of the final position of the moving target + target_id: target id if you want a real drone target + num_intermediate_points: Number of intermediate points in the target trajectory + render_mode: Render mode: "human", or None + size: Size of the map + multi_obj: Whether to return a multi-objective reward + """ + self.num_drones = len(drone_ids) + self._agent_location = dict() + self._target_location = {"unique": init_target_location} # unique target location for all agents + self._init_flying_pos = dict() + self._agents_names = np.array(["agent_" + str(i) for i in drone_ids]) + self.timestep = 0 + + # There are two more ref points than intermediate points, one for the initial and final target locations + self.num_ref_points = num_intermediate_points + 2 + # Ref is a 2d arrays for the target + # it contains the reference points (xyz) for the target at each timestep + self.ref: np.ndarray = np.array([init_target_location]) + + for i, agent in enumerate(self._agents_names): + self._init_flying_pos[agent] = init_flying_pos[i].copy() + + for t in range(1, self.num_ref_points): + self.ref = np.append( + self.ref, + [init_target_location + (final_target_location - init_target_location) * t / self.num_ref_points], + axis=0, + ) + + self._agent_location = self._init_flying_pos.copy() + + self.size = size + self.multi_obj = multi_obj + super().__init__( + render_mode=render_mode, + size=size, + init_flying_pos=self._init_flying_pos, + target_location=self._target_location, + agents_names=self._agents_names, + drone_ids=drone_ids, + target_id=target_id, + ) + + @override + def _observation_space(self, agent): + return spaces.Box( + low=np.tile(np.array([-self.size, -self.size, 0], dtype=np.float32), self.num_drones + 1), + high=np.tile(np.array([self.size, self.size, 3], dtype=np.float32), self.num_drones + 1), + shape=(3 * (self.num_drones + 1),), # coordinates of the drones and the target + dtype=np.float32, + ) + + @override + def _action_space(self, agent): + return spaces.Box(low=-1 * np.ones(3, dtype=np.float32), high=np.ones(3, dtype=np.float32), dtype=np.float32) + + @override + def _reward_space(self, agent): + if self.multi_obj: + return spaces.Box( + low=np.array([-10, -10], dtype=np.float32), + high=np.array([1, np.inf], dtype=np.float32), + shape=(2,), + dtype=np.float32, + ) + else: + return None + + @override + def _compute_obs(self): + obs = dict() + for agent in self._agents_names: + obs[agent] = self._agent_location[agent].copy() + obs[agent] = np.append(obs[agent], self._target_location["unique"]) + + for other_agent in self._agents_names: + if other_agent != agent: + obs[agent] = np.append(obs[agent], self._agent_location[other_agent]) + + return obs + + @override + def _transition_state(self, actions): + target_point_action = dict() + state = self._agent_location + # new targets + self._previous_target = self._target_location.copy() + if self.timestep < self.num_ref_points: + self._target_location["unique"] = self.ref[self.timestep] + else: # the target has stopped + self._target_location["unique"] = self.ref[-1] + + for agent in self.agents: + # Actions are clipped to stay in the map and scaled to do max 20cm in one step + target_point_action[agent] = np.clip( + state[agent] + actions[agent] * 0.2, [-self.size, -self.size, 0], [self.size, self.size, 3] + 
) + + return target_point_action + + @override + def _compute_reward(self): + # Reward is the mean distance to the other agents minus the distance to the target + reward = dict() + + for agent in self._agents_names: + reward_far_from_other_agents = 0 + reward_close_to_target = 0 + + # mean distance to the other agents + for other_agent in self._agents_names: + if other_agent != agent: + reward_far_from_other_agents += np.linalg.norm( + self._agent_location[agent] - self._agent_location[other_agent] + ) + + reward_far_from_other_agents /= self.num_drones - 1 + + # distance to the target + # (!) targets and locations must be updated before this + dist_from_old_target = _distance_to_target(self._agent_location[agent], self._previous_target["unique"]) + old_dist = _distance_to_target(self._previous_location[agent], self._previous_target["unique"]) + + # reward should be new_potential - old_potential but since the distances should be negated we reversed the signs + # -new_potential - (-old_potential) = old_potential - new_potential + reward_close_to_target = old_dist - dist_from_old_target + + # collision between two drones + for other_agent in self._agents_names: + if other_agent != agent and ( + np.linalg.norm(self._agent_location[agent] - self._agent_location[other_agent]) < CLOSENESS_THRESHOLD + ): + reward_far_from_other_agents = -10 + reward_close_to_target = -10 + + # collision with the ground or the target + if ( + self._agent_location[agent][2] < CLOSENESS_THRESHOLD + or np.linalg.norm(self._agent_location[agent] - self._target_location["unique"]) < CLOSENESS_THRESHOLD + ): + reward_far_from_other_agents = -10 + reward_close_to_target = -10 + + if self.multi_obj: + reward[agent] = np.array([reward_close_to_target, reward_far_from_other_agents]) + else: + # MO reward linearly combined using hardcoded weights + reward[agent] = 0.9995 * reward_close_to_target + 0.0005 * reward_far_from_other_agents + + return reward + + @override + def _compute_terminated(self): + terminated = dict() + + for agent in self.agents: + terminated[agent] = False + + for agent in self.agents: + # collision between two drones + for other_agent in self.agents: + if other_agent != agent: + terminated[agent] = terminated[agent] or ( + np.linalg.norm(self._agent_location[agent] - self._agent_location[other_agent]) < CLOSENESS_THRESHOLD + ) + + # collision with the ground + terminated[agent] = terminated[agent] or (self._agent_location[agent][2] < CLOSENESS_THRESHOLD) + + # collision with the target + terminated[agent] = terminated[agent] or ( + np.linalg.norm(self._agent_location[agent] - self._target_location["unique"]) < CLOSENESS_THRESHOLD + ) + + if terminated[agent] and self.render_mode == "human": + for other_agent in self.agents: + terminated[other_agent] = True + self.agents = [] + + return terminated + + @override + def _compute_truncation(self): + if self.timestep == 200: + truncation = {agent: True for agent in self._agents_names} + self.agents = [] + self.timestep = 0 + else: + truncation = {agent: False for agent in self._agents_names} + return truncation + + @override + def _compute_info(self): + info = dict() + return info + + @override + def state(self): + return np.append(np.array(list(self._agent_location.values())).flatten(), self._target_location["unique"]) + + +if __name__ == "__main__": + prll_env = Escort( + drone_ids=np.array([0, 1, 2, 3]), + render_mode="human", + init_flying_pos=np.array([[0, 0, 1], [1, 1, 1], [0, 1, 1], [2, 2, 1]]), + init_target_location=np.array([1, 1, 2.5]), + 
final_target_location=np.array([-2, -2, 3]), + num_intermediate_points=150, + ) + + observations, infos = prll_env.reset() + + while prll_env.agents: + actions = { + agent: prll_env.action_space(agent).sample() for agent in prll_env.agents + } # this is where you would insert your policy + observations, rewards, terminations, truncations, infos = prll_env.step(actions) + prll_env.render() + print("obs", observations, "reward", rewards) + time.sleep(0.02) diff --git a/momadm_benchmarks/envs/crazyrl/escort/escort_v0.py b/momadm_benchmarks/envs/crazyrl/escort/escort_v0.py new file mode 100644 index 00000000..f8b6927c --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/escort/escort_v0.py @@ -0,0 +1,5 @@ +"""CrazyRL/Escort environment for MOMARL.""" +from momadm_benchmarks.envs.crazyrl.escort.escort import env, parallel_env, raw_env + + +__all__ = ["env", "parallel_env", "raw_env"] diff --git a/momadm_benchmarks/envs/crazyrl/surround/__init__.py b/momadm_benchmarks/envs/crazyrl/surround/__init__.py new file mode 100644 index 00000000..45e8fc30 --- /dev/null +++ b/momadm_benchmarks/envs/crazyrl/surround/__init__.py @@ -0,0 +1,2 @@ +"""Surround environment for multi-agent reinforcement learning.""" +from momadm_benchmarks.envs.crazyrl.surround import surround_v0 diff --git a/momadm_benchmarks/envs/crazyrl/surround/surround.py b/momadm_benchmarks/envs/crazyrl/surround/surround.py index 2e6cb14f..3adc2ad8 100644 --- a/momadm_benchmarks/envs/crazyrl/surround/surround.py +++ b/momadm_benchmarks/envs/crazyrl/surround/surround.py @@ -57,7 +57,7 @@ def raw_env(*args, **kwargs): class Surround(MOBaseParallelEnv): """A Parallel Environment where drone learn how to surround a target point.""" - metadata = {"render_modes": ["human", "real"], "is_parallelizable": True, "render_fps": 20} + metadata = {"render_modes": ["human"], "is_parallelizable": True, "render_fps": 20} def __init__( self, @@ -67,7 +67,7 @@ def __init__( target_id: Optional[int] = None, render_mode=None, size: int = 2, - multi_obj: bool = False, + multi_obj: bool = True, ): """Surround environment for Crazyflies 2. @@ -76,7 +76,7 @@ def __init__( init_flying_pos: Array of initial positions of the drones when they are flying target_location: Array of the position of the target point target_id: Target id if you want a real drone target - render_mode: Render mode: "human", "real" or None + render_mode: Render mode: "human" or None size: Size of the map multi_obj: Whether to return a multi-objective reward """ @@ -117,18 +117,6 @@ def _observation_space(self, agent): def _action_space(self, agent): return spaces.Box(low=-1 * np.ones(3, dtype=np.float32), high=np.ones(3, dtype=np.float32), dtype=np.float32) - @override - def _reward_space(self, agent): - if self.multi_obj: - return spaces.Box( - low=np.array([-self.size, -self.size, 0], dtype=np.float32), - high=np.array([-self.size, -self.size, 0], dtype=np.float32), - shape=(2,), - dtype=np.float32, - ) - else: - return None - @override def _compute_obs(self): obs = dict()