[MuJoCo] factorize _get_rew() out of step() #819

Merged
27 commits, merged on Dec 6, 2023

Commits
cf153f6
Add Hopper and Walker2D models for v5
Kallinteris-Andreas May 2, 2023
bc92449
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas May 9, 2023
0cbdd72
Delete hopper_v5.xml
Kallinteris-Andreas May 9, 2023
db3734e
Delete walker2d_v5.xml
Kallinteris-Andreas May 9, 2023
a2d2e64
General MuJoCo Env Documentation Cleanup
Kallinteris-Andreas May 9, 2023
f58bb5e
typofix
Kallinteris-Andreas May 9, 2023
7a4bc32
typo fix
Kallinteris-Andreas May 9, 2023
2418631
update following @pseudo-rnd-thoughts reviews
Kallinteris-Andreas May 9, 2023
3b9080b
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas Jun 5, 2023
77bcb8b
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas Jun 16, 2023
7639d18
refactor `tests/env/test_mojoco.py` ->
Kallinteris-Andreas Jun 16, 2023
8eb1b11
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas Jun 27, 2023
61d0848
Update setup.py
Kallinteris-Andreas Oct 23, 2023
5831a19
do nothing
Kallinteris-Andreas Oct 23, 2023
803dc49
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas Nov 3, 2023
d99cc5d
[MuJoCo] add action space figures
Kallinteris-Andreas Nov 3, 2023
f788bb3
Merge branch 'Farama-Foundation:main' into main
Kallinteris-Andreas Nov 10, 2023
450b471
Merge branch 'Farama-Foundation:main' into mjx
Kallinteris-Andreas Nov 30, 2023
14fb4d8
replace `flat.copy()` with `flatten()`
Kallinteris-Andreas Dec 5, 2023
1583839
Merge branch 'Farama-Foundation:main' into mjx
Kallinteris-Andreas Dec 5, 2023
47a7059
add `MuJoCo.test_model_sensors`
Kallinteris-Andreas Dec 6, 2023
9dc31e2
`test_model_sensors` remove check for standup `v3`
Kallinteris-Andreas Dec 6, 2023
bededa3
factorize `_get_rew()` out of `step`
Kallinteris-Andreas Dec 6, 2023
999d888
some cleanup
Kallinteris-Andreas Dec 6, 2023
0f59baa
support `python==3.8`
Kallinteris-Andreas Dec 6, 2023
76f5e17
fix for real this time
Kallinteris-Andreas Dec 6, 2023
724e47f
`black`
Kallinteris-Andreas Dec 6, 2023
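The change is mechanical: each v5 environment's `step()` now delegates reward computation to a new `_get_rew()` method and merges the returned `reward_info` dict into `info`. A practical consequence is that rewards can be customized by overriding a single method instead of copying the whole `step()` body. A minimal sketch of that idea, assuming the v5 Hopper class is importable as `HopperEnv` from `gymnasium.envs.mujoco.hopper_v5` (the subclass and the extra penalty are hypothetical, not part of this PR):

```python
import numpy as np

from gymnasium.envs.mujoco.hopper_v5 import HopperEnv  # assumed import path


class ExtraCtrlCostHopper(HopperEnv):
    """Hypothetical subclass: tweak the reward without touching step()."""

    def _get_rew(self, x_velocity: float, action):
        # Reuse the stock reward terms from the refactored method.
        reward, reward_info = super()._get_rew(x_velocity, action)
        # Add an extra quadratic action penalty (illustrative only).
        extra_cost = 1e-3 * float(np.square(action).sum())
        reward_info["reward_extra_ctrl"] = -extra_cost
        return reward - extra_cost, reward_info
```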
31 changes: 20 additions & 11 deletions gymnasium/envs/mujoco/ant_v5.py
@@ -376,6 +376,23 @@ def step(self, action):
xy_velocity = (xy_position_after - xy_position_before) / self.dt
x_velocity, y_velocity = xy_velocity

observation = self._get_obs()
reward, reward_info = self._get_rew(x_velocity, action)
terminated = self.terminated
info = {
"x_position": self.data.qpos[0],
"y_position": self.data.qpos[1],
"distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2),
"x_velocity": x_velocity,
"y_velocity": y_velocity,
**reward_info,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info

def _get_rew(self, x_velocity: float, action):
forward_reward = x_velocity * self._forward_reward_weight
healthy_reward = self.healthy_reward
rewards = forward_reward + healthy_reward
@@ -384,24 +401,16 @@ def step(self, action):
contact_cost = self.contact_cost
costs = ctrl_cost + contact_cost

observation = self._get_obs()
reward = rewards - costs
terminated = self.terminated
info = {

reward_info = {
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
"reward_contact": -contact_cost,
"reward_survive": healthy_reward,
"x_position": self.data.qpos[0],
"y_position": self.data.qpos[1],
"distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2),
"x_velocity": x_velocity,
"y_velocity": y_velocity,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info
return reward, reward_info

def _get_obs(self):
position = self.data.qpos.flatten()
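For reference, a short usage sketch of the resulting `info` layout, assuming an `Ant-v5` id is registered in the installed Gymnasium build: the terms produced by `_get_rew()` arrive via `**reward_info`, next to the kinematic entries that stay in `step()`.

```python
import gymnasium as gym

env = gym.make("Ant-v5")  # assumes the v5 id is registered
env.reset(seed=0)
_, reward, terminated, truncated, info = env.step(env.action_space.sample())

# Reward terms surfaced by _get_rew() ...
print(info["reward_forward"], info["reward_ctrl"],
      info["reward_contact"], info["reward_survive"])
# ... alongside the position/velocity entries still assembled in step().
print(info["x_position"], info["y_position"], info["distance_from_origin"])
env.close()
```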
21 changes: 12 additions & 9 deletions gymnasium/envs/mujoco/half_cheetah_v5.py
@@ -245,22 +245,25 @@ def step(self, action):
x_position_after = self.data.qpos[0]
x_velocity = (x_position_after - x_position_before) / self.dt

ctrl_cost = self.control_cost(action)
observation = self._get_obs()
reward, reward_info = self._get_rew(x_velocity, action)
info = {"x_position": x_position_after, "x_velocity": x_velocity, **reward_info}

if self.render_mode == "human":
self.render()
return observation, reward, False, False, info

def _get_rew(self, x_velocity: float, action):
forward_reward = self._forward_reward_weight * x_velocity
ctrl_cost = self.control_cost(action)

observation = self._get_obs()
reward = forward_reward - ctrl_cost
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,

reward_info = {
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
}

if self.render_mode == "human":
self.render()
return observation, reward, False, False, info
return reward, reward_info

def _get_obs(self):
position = self.data.qpos.flatten()
30 changes: 19 additions & 11 deletions gymnasium/envs/mujoco/hopper_v5.py
@@ -316,29 +316,37 @@ def step(self, action):
x_position_after = self.data.qpos[0]
x_velocity = (x_position_after - x_position_before) / self.dt

ctrl_cost = self.control_cost(action)
observation = self._get_obs()
reward, reward_info = self._get_rew(x_velocity, action)
terminated = self.terminated
info = {
"x_position": x_position_after,
"z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1],
"x_velocity": x_velocity,
**reward_info,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info

def _get_rew(self, x_velocity: float, action):
forward_reward = self._forward_reward_weight * x_velocity
healthy_reward = self.healthy_reward

rewards = forward_reward + healthy_reward

ctrl_cost = self.control_cost(action)
costs = ctrl_cost

observation = self._get_obs()
reward = rewards - costs
terminated = self.terminated
info = {

reward_info = {
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
"reward_survive": healthy_reward,
"x_position": x_position_after,
"z_distance_from_origin": self.data.qpos[1] - self.init_qpos[1],
"x_velocity": x_velocity,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info
return reward, reward_info

def reset_model(self):
noise_low = -self._reset_noise_scale
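Since the Hopper reward is exactly `forward_reward + healthy_reward - ctrl_cost`, the entries in `reward_info` should sum back to the scalar reward. A rough consistency check, assuming a registered `Hopper-v5` id:

```python
import gymnasium as gym
import numpy as np

env = gym.make("Hopper-v5")  # assumed id
env.reset(seed=0)
_, reward, _, _, info = env.step(env.action_space.sample())

# reward_ctrl is stored with its sign already flipped, so a plain sum works.
parts = info["reward_forward"] + info["reward_survive"] + info["reward_ctrl"]
assert np.isclose(reward, parts)
env.close()
```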
36 changes: 22 additions & 14 deletions gymnasium/envs/mujoco/humanoid_v5.py
@@ -497,36 +497,44 @@ def step(self, action):
xy_velocity = (xy_position_after - xy_position_before) / self.dt
x_velocity, y_velocity = xy_velocity

ctrl_cost = self.control_cost(action)
contact_cost = self.contact_cost
costs = ctrl_cost + contact_cost

forward_reward = self._forward_reward_weight * x_velocity
healthy_reward = self.healthy_reward

rewards = forward_reward + healthy_reward

observation = self._get_obs()
reward = rewards - costs
reward, reward_info = self._get_rew(x_velocity, action)
terminated = self.terminated
info = {
"reward_survive": healthy_reward,
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
"reward_contact": -contact_cost,
"x_position": self.data.qpos[0],
"y_position": self.data.qpos[1],
"tendon_lenght": self.data.ten_length,
"tendon_velocity": self.data.ten_velocity,
"distance_from_origin": np.linalg.norm(self.data.qpos[0:2], ord=2),
"x_velocity": x_velocity,
"y_velocity": y_velocity,
**reward_info,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info

def _get_rew(self, x_velocity: float, action):
forward_reward = self._forward_reward_weight * x_velocity
healthy_reward = self.healthy_reward
rewards = forward_reward + healthy_reward

ctrl_cost = self.control_cost(action)
contact_cost = self.contact_cost
costs = ctrl_cost + contact_cost

reward = rewards - costs

reward_info = {
"reward_survive": healthy_reward,
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
"reward_contact": -contact_cost,
}

return reward, reward_info

def reset_model(self):
noise_low = -self._reset_noise_scale
noise_high = self._reset_noise_scale
27 changes: 18 additions & 9 deletions gymnasium/envs/mujoco/humanoidstandup_v5.py
@@ -444,6 +444,21 @@ def step(self, action):
self.do_simulation(action, self.frame_skip)
pos_after = self.data.qpos[2]

reward, reward_info = self._get_rew(pos_after, action)
info = {
"x_position": self.data.qpos[0],
"y_position": self.data.qpos[1],
"z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2],
"tendon_lenght": self.data.ten_length,
"tendon_velocity": self.data.ten_velocity,
**reward_info,
}

if self.render_mode == "human":
self.render()
return self._get_obs(), reward, False, False, info

def _get_rew(self, pos_after: float, action):
uph_cost = (pos_after - 0) / self.model.opt.timestep

quad_ctrl_cost = self._ctrl_cost_weight * np.square(self.data.ctrl).sum()
@@ -455,20 +470,14 @@ def step(self, action):
quad_impact_cost = np.clip(quad_impact_cost, min_impact_cost, max_impact_cost)

reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
info = {

reward_info = {
"reward_linup": uph_cost,
"reward_quadctrl": -quad_ctrl_cost,
"reward_impact": -quad_impact_cost,
"x_position": self.data.qpos[0],
"y_position": self.data.qpos[1],
"z_distance_from_origin": self.data.qpos[2] - self.init_qpos[2],
"tendon_lenght": self.data.ten_length,
"tendon_velocity": self.data.ten_velocity,
}

if self.render_mode == "human":
self.render()
return self._get_obs(), reward, False, False, info
return reward, reward_info

def reset_model(self):
noise_low = -self._reset_noise_scale
19 changes: 12 additions & 7 deletions gymnasium/envs/mujoco/inverted_double_pendulum_v5.py
@@ -194,27 +194,32 @@ def __init__(
def step(self, action):
self.do_simulation(action, self.frame_skip)

x, _, y = self.data.site_xpos[0]
observation = self._get_obs()
terminated = bool(y <= 1)
reward, reward_info = self._get_rew(x, y, terminated)

x, _, y = self.data.site_xpos[0]
v1, v2 = self.data.qvel[1:3]
info = reward_info

terminated = bool(y <= 1)
if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info

def _get_rew(self, x, y, terminated):
v1, v2 = self.data.qvel[1:3]
dist_penalty = 0.01 * x**2 + (y - 2) ** 2
vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
alive_bonus = self._healthy_reward * int(not terminated)

reward = alive_bonus - dist_penalty - vel_penalty

info = {
reward_info = {
"reward_survive": alive_bonus,
"distance_penalty": -dist_penalty,
"velocity_penalty": -vel_penalty,
}

if self.render_mode == "human":
self.render()
return observation, reward, terminated, False, info
return reward, reward_info

def _get_obs(self):
return np.concatenate(
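To make the pendulum terms concrete, here is a small worked example of the formulas above with made-up state values; the alive bonus of 10 is the assumed default for `_healthy_reward`.

```python
# Hypothetical state: cart x-offset and tip height y (metres), joint velocities (rad/s).
x, y = 0.1, 1.9
v1, v2 = 0.5, -0.2
terminated = y <= 1                                 # False: the tip is above 1 m

dist_penalty = 0.01 * x**2 + (y - 2) ** 2           # 0.0001 + 0.01    = 0.0101
vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2           # 0.00025 + 0.0002 = 0.00045
alive_bonus = 10 * int(not terminated)              # 10 (assumed default weight)
reward = alive_bonus - dist_penalty - vel_penalty   # ≈ 9.98945
```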
21 changes: 14 additions & 7 deletions gymnasium/envs/mujoco/pusher_v5.py
@@ -220,25 +220,32 @@ def __init__(
}

def step(self, action):
reward, reward_info = self._get_rew(action)
self.do_simulation(action, self.frame_skip)

observation = self._get_obs()
info = reward_info
if self.render_mode == "human":
self.render()
return observation, reward, False, False, info

def _get_rew(self, action):
vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
vec_2 = self.get_body_com("object") - self.get_body_com("goal")

reward_near = -np.linalg.norm(vec_1) * self._reward_near_weight
reward_dist = -np.linalg.norm(vec_2) * self._reward_dist_weight
reward_ctrl = -np.square(action).sum() * self._reward_control_weight

self.do_simulation(action, self.frame_skip)

observation = self._get_obs()
reward = reward_dist + reward_ctrl + reward_near
info = {

reward_info = {
"reward_dist": reward_dist,
"reward_ctrl": reward_ctrl,
"reward_near": reward_near,
}
if self.render_mode == "human":
self.render()
return observation, reward, False, False, info

return reward, reward_info

def reset_model(self):
qpos = self.init_qpos
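Note the ordering here: `_get_rew()` runs before `do_simulation()`, so the Pusher (and Reacher, below) reward describes the state at the start of the step, exactly as it did before the refactor. A rough way to see this, assuming a registered `Pusher-v5` id and reusing the private `_reward_dist_weight` from the diff:

```python
import gymnasium as gym
import numpy as np

env = gym.make("Pusher-v5")  # assumed id
env.reset(seed=0)
base = env.unwrapped

# Distance term computed from the pre-step state ...
vec_2 = base.get_body_com("object") - base.get_body_com("goal")
expected_dist = -np.linalg.norm(vec_2) * base._reward_dist_weight

# ... should equal what step() reports, because _get_rew() is called
# before do_simulation() advances the physics.
_, _, _, _, info = env.step(env.action_space.sample())
print(info["reward_dist"], expected_dist)
env.close()
```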
21 changes: 14 additions & 7 deletions gymnasium/envs/mujoco/reacher_v5.py
@@ -197,21 +197,28 @@ def __init__(
}

def step(self, action):
reward, reward_info = self._get_rew(action)
self.do_simulation(action, self.frame_skip)

observation = self._get_obs()
info = reward_info
if self.render_mode == "human":
self.render()
return observation, reward, False, False, info

def _get_rew(self, action):
vec = self.get_body_com("fingertip") - self.get_body_com("target")
reward_dist = -np.linalg.norm(vec) * self._reward_dist_weight
reward_ctrl = -np.square(action).sum() * self._reward_control_weight

self.do_simulation(action, self.frame_skip)

observation = self._get_obs()
reward = reward_dist + reward_ctrl
info = {

reward_info = {
"reward_dist": reward_dist,
"reward_ctrl": reward_ctrl,
}
if self.render_mode == "human":
self.render()
return observation, reward, False, False, info

return reward, reward_info

def reset_model(self):
qpos = (
22 changes: 15 additions & 7 deletions gymnasium/envs/mujoco/swimmer_v5.py
@@ -234,27 +234,35 @@ def step(self, action):
xy_velocity = (xy_position_after - xy_position_before) / self.dt
x_velocity, y_velocity = xy_velocity

forward_reward = self._forward_reward_weight * x_velocity

ctrl_cost = self.control_cost(action)

observation = self._get_obs()
reward = forward_reward - ctrl_cost
reward, reward_info = self._get_rew(x_velocity, action)
info = {
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
"x_position": xy_position_after[0],
"y_position": xy_position_after[1],
"distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
"x_velocity": x_velocity,
"y_velocity": y_velocity,
**reward_info,
}

if self.render_mode == "human":
self.render()

return observation, reward, False, False, info

def _get_rew(self, x_velocity: float, action):
forward_reward = self._forward_reward_weight * x_velocity
ctrl_cost = self.control_cost(action)

reward = forward_reward - ctrl_cost

reward_info = {
"reward_forward": forward_reward,
"reward_ctrl": -ctrl_cost,
}

return reward, reward_info

def _get_obs(self):
position = self.data.qpos.flatten()
velocity = self.data.qvel.flatten()
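Across all of the refactored environments the `reward_*` entries now reach `info` through `**reward_info`, so a quick smoke test can simply look for them (the env ids below are assumed to be registered as `<Name>-v5`):

```python
import gymnasium as gym

for env_id in ["HalfCheetah-v5", "Hopper-v5", "Humanoid-v5", "Swimmer-v5"]:
    env = gym.make(env_id)
    env.reset(seed=0)
    _, _, _, _, info = env.step(env.action_space.sample())
    assert any(key.startswith("reward_") for key in info), env_id
    env.close()
```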