From 14def0759fa094e9a4a6fa0d7b4d66719a11007b Mon Sep 17 00:00:00 2001 From: Kallinteris Andreas <30759571+Kallinteris-Andreas@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:52:07 +0200 Subject: [PATCH] [MuJoCo] factorize `_get_rew()` out of `step()` (#819) --- gymnasium/envs/mujoco/ant_v5.py | 31 ++++++++++------ gymnasium/envs/mujoco/half_cheetah_v5.py | 21 ++++++----- gymnasium/envs/mujoco/hopper_v5.py | 30 ++++++++++------ gymnasium/envs/mujoco/humanoid_v5.py | 36 +++++++++++-------- gymnasium/envs/mujoco/humanoidstandup_v5.py | 27 +++++++++----- .../mujoco/inverted_double_pendulum_v5.py | 19 ++++++---- gymnasium/envs/mujoco/pusher_v5.py | 21 +++++++---- gymnasium/envs/mujoco/reacher_v5.py | 21 +++++++---- gymnasium/envs/mujoco/swimmer_v5.py | 22 ++++++++---- gymnasium/envs/mujoco/walker2d_v5.py | 31 +++++++++------- 10 files changed, 165 insertions(+), 94 deletions(-) diff --git a/gymnasium/envs/mujoco/ant_v5.py b/gymnasium/envs/mujoco/ant_v5.py index 5f15ad2cd..8b3ab177e 100644 --- a/gymnasium/envs/mujoco/ant_v5.py +++ b/gymnasium/envs/mujoco/ant_v5.py @@ -376,6 +376,23 @@ def step(self, action): xy_velocity = (xy_position_after - xy_position_before) / self.dt x_velocity, y_velocity = xy_velocity + observation = self._get_obs() + reward, reward_info = self._get_rew(x_velocity, action) + terminated = self.terminated + info = { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), + "x_velocity": x_velocity, + "y_velocity": y_velocity, + **reward_info, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + + def _get_rew(self, x_velocity: float, action): forward_reward = x_velocity * self._forward_reward_weight healthy_reward = self.healthy_reward rewards = forward_reward + healthy_reward @@ -384,24 +401,16 @@ def step(self, action): contact_cost = self.contact_cost costs = ctrl_cost + contact_cost - observation = self._get_obs() reward = rewards - costs - terminated = self.terminated - info = { + + reward_info = { "reward_forward": forward_reward, "reward_ctrl": -ctrl_cost, "reward_contact": -contact_cost, "reward_survive": healthy_reward, - "x_position": self.data.qpos[0], - "y_position": self.data.qpos[1], - "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), - "x_velocity": x_velocity, - "y_velocity": y_velocity, } - if self.render_mode == "human": - self.render() - return observation, reward, terminated, False, info + return reward, reward_info def _get_obs(self): position = self.data.qpos.flatten() diff --git a/gymnasium/envs/mujoco/half_cheetah_v5.py b/gymnasium/envs/mujoco/half_cheetah_v5.py index 6499ca7fb..0649f4503 100644 --- a/gymnasium/envs/mujoco/half_cheetah_v5.py +++ b/gymnasium/envs/mujoco/half_cheetah_v5.py @@ -245,22 +245,25 @@ def step(self, action): x_position_after = self.data.qpos[0] x_velocity = (x_position_after - x_position_before) / self.dt - ctrl_cost = self.control_cost(action) + observation = self._get_obs() + reward, reward_info = self._get_rew(x_velocity, action) + info = {"x_position": x_position_after, "x_velocity": x_velocity, **reward_info} + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def _get_rew(self, x_velocity: float, action): forward_reward = self._forward_reward_weight * x_velocity + ctrl_cost = self.control_cost(action) - observation = self._get_obs() reward = forward_reward - ctrl_cost - info = { - "x_position": x_position_after, - "x_velocity": x_velocity, + + reward_info = { "reward_forward": forward_reward, "reward_ctrl": -ctrl_cost, } - - if self.render_mode == "human": - self.render() - return observation, reward, False, False, info + return reward, reward_info def _get_obs(self): position = self.data.qpos.flatten() diff --git a/gymnasium/envs/mujoco/hopper_v5.py b/gymnasium/envs/mujoco/hopper_v5.py index a1f1086fc..68813d190 100644 --- a/gymnasium/envs/mujoco/hopper_v5.py +++ b/gymnasium/envs/mujoco/hopper_v5.py @@ -316,29 +316,37 @@ def step(self, action): x_position_after = self.data.qpos[0] x_velocity = (x_position_after - x_position_before) / self.dt - ctrl_cost = self.control_cost(action) + observation = self._get_obs() + reward, reward_info = self._get_rew(x_velocity, action) + terminated = self.terminated + info = { + "x_position": x_position_after, + "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], + "x_velocity": x_velocity, + **reward_info, + } + + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + def _get_rew(self, x_velocity: float, action): forward_reward = self._forward_reward_weight * x_velocity healthy_reward = self.healthy_reward - rewards = forward_reward + healthy_reward + + ctrl_cost = self.control_cost(action) costs = ctrl_cost - observation = self._get_obs() reward = rewards - costs - terminated = self.terminated - info = { + + reward_info = { "reward_forward": forward_reward, "reward_ctrl": -ctrl_cost, "reward_survive": healthy_reward, - "x_position": x_position_after, - "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], - "x_velocity": x_velocity, } - if self.render_mode == "human": - self.render() - return observation, reward, terminated, False, info + return reward, reward_info def reset_model(self): noise_low = -self._reset_noise_scale diff --git a/gymnasium/envs/mujoco/humanoid_v5.py b/gymnasium/envs/mujoco/humanoid_v5.py index b0f078ec0..1834d6d48 100644 --- a/gymnasium/envs/mujoco/humanoid_v5.py +++ b/gymnasium/envs/mujoco/humanoid_v5.py @@ -497,23 +497,10 @@ def step(self, action): xy_velocity = (xy_position_after - xy_position_before) / self.dt x_velocity, y_velocity = xy_velocity - ctrl_cost = self.control_cost(action) - contact_cost = self.contact_cost - costs = ctrl_cost + contact_cost - - forward_reward = self._forward_reward_weight * x_velocity - healthy_reward = self.healthy_reward - - rewards = forward_reward + healthy_reward - observation = self._get_obs() - reward = rewards - costs + reward, reward_info = self._get_rew(x_velocity, action) terminated = self.terminated info = { - "reward_survive": healthy_reward, - "reward_forward": forward_reward, - "reward_ctrl": -ctrl_cost, - "reward_contact": -contact_cost, "x_position": self.data.qpos[0], "y_position": self.data.qpos[1], "tendon_lenght": self.data.ten_length, @@ -521,12 +508,33 @@ def step(self, action): "distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2), "x_velocity": x_velocity, "y_velocity": y_velocity, + **reward_info, } if self.render_mode == "human": self.render() return observation, reward, terminated, False, info + def _get_rew(self, x_velocity: float, action): + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + rewards = forward_reward + healthy_reward + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + costs = ctrl_cost + contact_cost + + reward = rewards - costs + + reward_info = { + "reward_survive": healthy_reward, + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_contact": -contact_cost, + } + + return reward, reward_info + def reset_model(self): noise_low = -self._reset_noise_scale noise_high = self._reset_noise_scale diff --git a/gymnasium/envs/mujoco/humanoidstandup_v5.py b/gymnasium/envs/mujoco/humanoidstandup_v5.py index 46e591ac8..99b35cc50 100644 --- a/gymnasium/envs/mujoco/humanoidstandup_v5.py +++ b/gymnasium/envs/mujoco/humanoidstandup_v5.py @@ -444,6 +444,21 @@ def step(self, action): self.do_simulation(action, self.frame_skip) pos_after = self.data.qpos[2] + reward, reward_info = self._get_rew(pos_after, action) + info = { + "x_position": self.data.qpos[0], + "y_position": self.data.qpos[1], + "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], + "tendon_lenght": self.data.ten_length, + "tendon_velocity": self.data.ten_velocity, + **reward_info, + } + + if self.render_mode == "human": + self.render() + return self._get_obs(), reward, False, False, info + + def _get_rew(self, pos_after: float, action): uph_cost = (pos_after - 0) / self.model.opt.timestep quad_ctrl_cost = self._ctrl_cost_weight * np.square(self.data.ctrl).sum() @@ -455,20 +470,14 @@ def step(self, action): quad_impact_cost = np.clip(quad_impact_cost, min_impact_cost, max_impact_cost) reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1 - info = { + + reward_info = { "reward_linup": uph_cost, "reward_quadctrl": -quad_ctrl_cost, "reward_impact": -quad_impact_cost, - "x_position": self.data.qpos[0], - "y_position": self.data.qpos[1], - "z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2], - "tendon_lenght": self.data.ten_length, - "tendon_velocity": self.data.ten_velocity, } - if self.render_mode == "human": - self.render() - return self._get_obs(), reward, False, False, info + return reward, reward_info def reset_model(self): noise_low = -self._reset_noise_scale diff --git a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py index dfa190ca2..bf2835577 100644 --- a/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py +++ b/gymnasium/envs/mujoco/inverted_double_pendulum_v5.py @@ -194,27 +194,32 @@ def __init__( def step(self, action): self.do_simulation(action, self.frame_skip) + x, _, y = self.data.site_xpos[0] observation = self._get_obs() + terminated = bool(y <= 1) + reward, reward_info = self._get_rew(x, y, terminated) - x, _, y = self.data.site_xpos[0] - v1, v2 = self.data.qvel[1:3] + info = reward_info - terminated = bool(y <= 1) + if self.render_mode == "human": + self.render() + return observation, reward, terminated, False, info + def _get_rew(self, x, y, terminated): + v1, v2 = self.data.qvel[1:3] dist_penalty = 0.01 * x**2 + (y - 2) ** 2 vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2 alive_bonus = self._healthy_reward * int(not terminated) + reward = alive_bonus - dist_penalty - vel_penalty - info = { + reward_info = { "reward_survive": alive_bonus, "distance_penalty": -dist_penalty, "velocity_penalty": -vel_penalty, } - if self.render_mode == "human": - self.render() - return observation, reward, terminated, False, info + return reward, reward_info def _get_obs(self): return np.concatenate( diff --git a/gymnasium/envs/mujoco/pusher_v5.py b/gymnasium/envs/mujoco/pusher_v5.py index 99d4eabf0..490c4b016 100644 --- a/gymnasium/envs/mujoco/pusher_v5.py +++ b/gymnasium/envs/mujoco/pusher_v5.py @@ -220,6 +220,16 @@ def __init__( } def step(self, action): + reward, reward_info = self._get_rew(action) + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + info = reward_info + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def _get_rew(self, action): vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm") vec_2 = self.get_body_com("object") - self.get_body_com("goal") @@ -227,18 +237,15 @@ def step(self, action): reward_dist = -np.linalg.norm(vec_2) * self._reward_dist_weight reward_ctrl = -np.square(action).sum() * self._reward_control_weight - self.do_simulation(action, self.frame_skip) - - observation = self._get_obs() reward = reward_dist + reward_ctrl + reward_near - info = { + + reward_info = { "reward_dist": reward_dist, "reward_ctrl": reward_ctrl, "reward_near": reward_near, } - if self.render_mode == "human": - self.render() - return observation, reward, False, False, info + + return reward, reward_info def reset_model(self): qpos = self.init_qpos diff --git a/gymnasium/envs/mujoco/reacher_v5.py b/gymnasium/envs/mujoco/reacher_v5.py index e8ba867d1..db23e1961 100644 --- a/gymnasium/envs/mujoco/reacher_v5.py +++ b/gymnasium/envs/mujoco/reacher_v5.py @@ -197,21 +197,28 @@ def __init__( } def step(self, action): + reward, reward_info = self._get_rew(action) + self.do_simulation(action, self.frame_skip) + + observation = self._get_obs() + info = reward_info + if self.render_mode == "human": + self.render() + return observation, reward, False, False, info + + def _get_rew(self, action): vec = self.get_body_com("fingertip") - self.get_body_com("target") reward_dist = -np.linalg.norm(vec) * self._reward_dist_weight reward_ctrl = -np.square(action).sum() * self._reward_control_weight - self.do_simulation(action, self.frame_skip) - - observation = self._get_obs() reward = reward_dist + reward_ctrl - info = { + + reward_info = { "reward_dist": reward_dist, "reward_ctrl": reward_ctrl, } - if self.render_mode == "human": - self.render() - return observation, reward, False, False, info + + return reward, reward_info def reset_model(self): qpos = ( diff --git a/gymnasium/envs/mujoco/swimmer_v5.py b/gymnasium/envs/mujoco/swimmer_v5.py index c49267bce..a231cc627 100644 --- a/gymnasium/envs/mujoco/swimmer_v5.py +++ b/gymnasium/envs/mujoco/swimmer_v5.py @@ -234,20 +234,15 @@ def step(self, action): xy_velocity = (xy_position_after - xy_position_before) / self.dt x_velocity, y_velocity = xy_velocity - forward_reward = self._forward_reward_weight * x_velocity - - ctrl_cost = self.control_cost(action) - observation = self._get_obs() - reward = forward_reward - ctrl_cost + reward, reward_info = self._get_rew(x_velocity, action) info = { - "reward_forward": forward_reward, - "reward_ctrl": -ctrl_cost, "x_position": xy_position_after[0], "y_position": xy_position_after[1], "distance_from_origin": np.linalg.norm(xy_position_after, ord=2), "x_velocity": x_velocity, "y_velocity": y_velocity, + **reward_info, } if self.render_mode == "human": @@ -255,6 +250,19 @@ def step(self, action): return observation, reward, False, False, info + def _get_rew(self, x_velocity: float, action): + forward_reward = self._forward_reward_weight * x_velocity + ctrl_cost = self.control_cost(action) + + reward = forward_reward - ctrl_cost + + reward_info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + } + + return reward, reward_info + def _get_obs(self): position = self.data.qpos.flatten() velocity = self.data.qvel.flatten() diff --git a/gymnasium/envs/mujoco/walker2d_v5.py b/gymnasium/envs/mujoco/walker2d_v5.py index dde3e9b03..555ca4944 100644 --- a/gymnasium/envs/mujoco/walker2d_v5.py +++ b/gymnasium/envs/mujoco/walker2d_v5.py @@ -310,24 +310,14 @@ def step(self, action): x_position_after = self.data.qpos[0] x_velocity = (x_position_after - x_position_before) / self.dt - ctrl_cost = self.control_cost(action) - - forward_reward = self._forward_reward_weight * x_velocity - healthy_reward = self.healthy_reward - - rewards = forward_reward + healthy_reward - costs = ctrl_cost - observation = self._get_obs() - reward = rewards - costs + reward, reward_info = self._get_rew(x_velocity, action) terminated = self.terminated info = { - "reward_forward": forward_reward, - "reward_ctrl": -ctrl_cost, - "reward_survive": healthy_reward, "x_position": x_position_after, "z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1], "x_velocity": x_velocity, + **reward_info, } if self.render_mode == "human": @@ -335,6 +325,23 @@ def step(self, action): return observation, reward, terminated, False, info + def _get_rew(self, x_velocity: float, action): + forward_reward = self._forward_reward_weight * x_velocity + healthy_reward = self.healthy_reward + rewards = forward_reward + healthy_reward + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + reward = rewards - costs + + reward_info = { + "reward_forward": forward_reward, + "reward_ctrl": -ctrl_cost, + "reward_survive": healthy_reward, + } + + return reward, reward_info + def reset_model(self): noise_low = -self._reset_noise_scale noise_high = self._reset_noise_scale