self_balancer_env.py
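"""A gym-style wrapper around a MuJoCo self-balancing platform model (sbp_model.xml)."""
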
from gym import spaces
from mujoco_py import load_model_from_path, MjSim, MjViewer
import numpy as np


class SelfBalancerEnv:
    def __init__(self):
        # load the model from the XML file
        model = load_model_from_path('sbp_model.xml')
        # create the simulator that compiles the dynamics of the model
        self.sim = MjSim(model)
        # create the viewer for visualizing the model dynamics on screen
        self.viewer = MjViewer(self.sim)
        sim_state = self.sim.get_state()
        self.sim.set_state(sim_state)
        # the topmost platform is observed in polar coordinates as two angles, each ranging from -90 to 90 degrees
        self.observation_space = spaces.Box(low=-np.pi / 2.0, high=np.pi / 2.0, shape=(2,))
        # the action space is the target rotation of the motors
        self.action_space = spaces.Box(low=-np.pi / 2.0, high=np.pi / 2.0, shape=(2,))
        # gear ratio: the ratio of the number of rotations of the driver gear to the number of rotations of the driven gear
        self.servo_gear_ratio = 20
        self.hand_gear_ratio = 50

    def step(self, action):
        # clip the action between -90 and 90 degrees
        action = np.clip(action, -np.pi / 2, np.pi / 2)
        # set the action as the target motor rotation (the last two control entries drive the servo motors; the first two drive the hand)
        self.sim.data.ctrl[2:] = action * self.servo_gear_ratio
        # execute the action
        self.sim.step()
        # get the new observation
        observation = self._get_observation()
        # compute the reward obtained from the new observation after executing the action
        reward = self._get_reward(observation)
        # when the reward falls below a threshold, the episode is marked as done
        done = reward < -0.5
        return observation, reward, done, {}

    def _get_observation(self):
        state = self.sim.get_state()
        # the net position of the topmost platform is the sum of the motor position and the hand offset
        net_position = state.qpos[:2] + state.qpos[2:]
        # net_velocity = state.qvel[:2] + state.qvel[2:]
        # since we only deal with positions, only they are returned
        return net_position

    # the reward measures how far the platform is from the balanced position
    def _get_reward(self, observation=None):
        if observation is None:
            observation = self._get_observation()
        return 1 - max(np.abs(observation))

    # randomly return -1, 0 or +1 degree (in radians); used as noise
    @staticmethod
    def _random_angle_delta():
        return np.random.randint(-1, 2) * np.pi / 180.0

    # render the current dynamics
    def render(self):
        self.viewer.render()

    # randomly perturb the hand position by moving it in a random direction, 1 degree at a time
    def noise(self):
        # apply the noise to the hand actuators (the first two control entries)
        x, y = self._random_angle_delta(), self._random_angle_delta()
        self.sim.data.ctrl[:2] += np.array([x, y]) * self.hand_gear_ratio
        self.sim.step()
        return self.sim.data.ctrl[:2] / self.hand_gear_ratio

    # set the hand noise from outside
    def set_noise(self, noise):
        self.sim.data.ctrl[:2] = np.array(noise) * self.hand_gear_ratio
        self.sim.step()

    # reset sets the environment to start_pos, which by default is the perfectly balanced position without noise
    def reset(self, start_pos=None):
        self.sim.reset()
        self.sim.data.qpos[:] = [0, 0, 0, 0] if start_pos is None else start_pos
        self.sim.data.qvel[:] = [0, 0, 0, 0]
        return self._get_observation()
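

# A minimal usage sketch, not part of the original file: it assumes sbp_model.xml is in the
# working directory and that mujoco_py is installed. It resets the environment, applies random
# actions and random hand noise, and renders each step until the platform tilts too far.
if __name__ == '__main__':
    env = SelfBalancerEnv()
    observation = env.reset()
    done = False
    while not done:
        # sample a random target motor rotation within the action bounds
        action = env.action_space.sample()
        observation, reward, done, _ = env.step(action)
        # perturb the hand position and render the current state
        env.noise()
        env.render()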