Bug fix: RM transitions #51

Open · wants to merge 8 commits into master
4 changes: 4 additions & 0 deletions README.md
@@ -119,3 +119,7 @@ python3 play.py --env <environment_id>
```

where `<environment_id>` can be any environment from the office, craft, or water domains. To control the agent, use the WASD keys. The environments are described in the paper.

## Note: Bug fix

There was a bug in the way that environment terminal states were handled, which is now fixed (see PR #51). The bug only affected tasks in which a terminal state can be reached in two ways that give different rewards, namely some tasks in Office World and some in Water World (Craft World and Half-Cheetah should not be affected). After the fix, the Office World results remain consistent with the conclusions of the paper (Water World has not been re-run yet).
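
For intuition, the affected pattern can be sketched as follows (a hypothetical layout using the names from `reward_machine.py`, not one of the shipped tasks): two edges out of the same RM state reach terminal states with different rewards, and the old representation, which keyed transitions by their target state and collapsed all terminal states onto a single id, let the second edge overwrite the first.

```python
# Hypothetical sketch of the bug; names follow reward_machine.py, rewards are shown as strings.
terminal_u = -1  # all terminal states used to be collapsed onto this single id

# Old layout: rewards keyed by target state, so two edges into the terminal state collide.
delta_r_old = {0: {}}
delta_r_old[0][terminal_u] = "ConstantRewardFunction(0)"  # terminal reached via proposition 'a'
delta_r_old[0][terminal_u] = "ConstantRewardFunction(1)"  # terminal reached via 'b' -- overwrites the first

# New layout (this PR): rewards keyed by the edge's DNF formula, so both rewards survive.
delta_r_new = {0: {"a&!n": "ConstantRewardFunction(0)", "b&!n": "ConstantRewardFunction(1)"}}
```
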
11 changes: 6 additions & 5 deletions reward_machines/envs/__init__.py
@@ -41,11 +41,12 @@
max_episode_steps=1000
)

register(
id='Office-single-v0',
entry_point='envs.grids.grid_environment:OfficeRM3Env',
max_episode_steps=1000
)
for i in range(1, 5):
register(
id=f'Office-single-T{i}-v0',
entry_point=f'envs.grids.grid_environment:OfficeRM{i}Env',
max_episode_steps=1000
)

# ----------------------------------------- CRAFT
for i in range(11):
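
As a quick check of the new registrations, something like the following should work once the `envs` package has been imported so the `register()` calls above run (a hedged sketch; it assumes the working directory is the repo's `reward_machines/` folder, as in `run.py`):

```python
# Hedged usage sketch: importing `envs` is what executes the register() calls above.
import gym
import envs  # noqa: F401  -- registers Office-single-T1-v0 ... Office-single-T4-v0, among others

env = gym.make("Office-single-T2-v0")
obs = env.reset()
```
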
18 changes: 18 additions & 0 deletions reward_machines/envs/grids/grid_environment.py
@@ -120,12 +120,30 @@ def __init__(self):
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM1Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t1.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM2Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t2.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM3Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t3.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM4Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t4.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class CraftRMEnv(GridRMEnv):
def __init__(self, file_map):
rm_files = ["./envs/grids/reward_machines/craft/t%d.txt"%i for i in range(1,11)]
19 changes: 9 additions & 10 deletions reward_machines/envs/grids/reward_machines/office/t3.txt
@@ -1,11 +1,10 @@
0 # initial state
[1] # terminal state
(0,0,'!e&!f&!n',ConstantRewardFunction(0))
(0,2,'e&!n',ConstantRewardFunction(0))
(0,3,'!e&f&!n',ConstantRewardFunction(0))
(2,2,'!f&!n',ConstantRewardFunction(0))
(2,4,'f&!n',ConstantRewardFunction(0))
(3,3,'!e&!n',ConstantRewardFunction(0))
(3,4,'e&!n',ConstantRewardFunction(0))
(4,1,'g&!n',ConstantRewardFunction(1))
(4,4,'!g&!n',ConstantRewardFunction(0))
[4] # terminal state
(0,0,'!a&!n',ConstantRewardFunction(0))
(0,1,'a&!n',ConstantRewardFunction(0))
(1,1,'!b&!n',ConstantRewardFunction(0))
(1,2,'b&!n',ConstantRewardFunction(0))
(2,2,'!c&!n',ConstantRewardFunction(0))
(2,3,'c&!n',ConstantRewardFunction(0))
(3,3,'!d&!n',ConstantRewardFunction(0))
(3,4,'d&!n',ConstantRewardFunction(1))
19 changes: 10 additions & 9 deletions reward_machines/envs/grids/reward_machines/office/t4.txt
@@ -1,10 +1,11 @@
0 # initial state
[4] # terminal state
(0,0,'!a&!n',ConstantRewardFunction(0))
(0,1,'a&!n',ConstantRewardFunction(0))
(1,1,'!b&!n',ConstantRewardFunction(0))
(1,2,'b&!n',ConstantRewardFunction(0))
(2,2,'!c&!n',ConstantRewardFunction(0))
(2,3,'c&!n',ConstantRewardFunction(0))
(3,3,'!d&!n',ConstantRewardFunction(0))
(3,4,'d&!n',ConstantRewardFunction(1))
[1] # terminal state
(0,0,'!e&!f&!n',ConstantRewardFunction(0))
(0,2,'e&!n',ConstantRewardFunction(0))
(0,3,'!e&f&!n',ConstantRewardFunction(0))
(2,2,'!f&!n',ConstantRewardFunction(0))
(2,4,'f&!n',ConstantRewardFunction(0))
(3,3,'!e&!n',ConstantRewardFunction(0))
(3,4,'e&!n',ConstantRewardFunction(0))
(4,1,'g&!n',ConstantRewardFunction(1))
(4,4,'!g&!n',ConstantRewardFunction(0))
54 changes: 28 additions & 26 deletions reward_machines/reward_machines/reward_machine.py
@@ -9,7 +9,8 @@ def __init__(self, file):
self.u0 = None # initial state
self.delta_u = {} # state-transition function
self.delta_r = {} # reward-transition function
self.terminal_u = -1 # All terminal states are sent to the same terminal state with id *-1*
self.terminal_u = -1
self.terminal_states = [self.terminal_u]
self._load_reward_machine(file)
self.known_transitions = {} # Auxiliary variable to speed up computation of the next RM state

@@ -22,25 +23,26 @@ def add_reward_shaping(self, gamma, rs_gamma):
- rs_gamma(float): the gamma used in the value iteration that computes the shaping potentials
"""
self.gamma = gamma
self.potentials = value_iteration(self.U, self.delta_u, self.delta_r, self.terminal_u, rs_gamma)
self.potentials = value_iteration(self.U, self.delta_u, self.delta_r, self.terminal_states, rs_gamma)
for u in self.potentials:
self.potentials[u] = -self.potentials[u]


def reset(self):
# Returns the initial state
return self.u0

def _compute_next_state(self, u1, true_props):
for u2 in self.delta_u[u1]:
if evaluate_dnf(self.delta_u[u1][u2], true_props):
return u2
return self.terminal_u # no transition is defined for true_props
""" Given the current state u1 and a proposition valuation, it returns the next state """
for phi, next_state in self.delta_u[u1].items():
if evaluate_dnf(phi, true_props):
# at most one formula should be satisfied by the valuation
return next_state, phi
return self.terminal_u, "no formula"

def get_next_state(self, u1, true_props):
if (u1,true_props) not in self.known_transitions:
u2 = self._compute_next_state(u1, true_props)
self.known_transitions[(u1,true_props)] = u2
u2, phi = self._compute_next_state(u1, true_props)
self.known_transitions[(u1,true_props)] = u2, phi
return self.known_transitions[(u1,true_props)]

def step(self, u1, true_props, s_info, add_rs=False, env_done=False):
@@ -50,11 +52,11 @@ def step(self, u1, true_props, s_info, add_rs=False, env_done=False):
"""

# Computing the next state in the RM and checking if the episode is done
assert u1 != self.terminal_u, "the RM was set to a terminal state!"
u2 = self.get_next_state(u1, true_props)
done = (u2 == self.terminal_u)
assert u1 not in self.terminal_states, f"the RM was set to a terminal state! {u1}"
u2, phi = self.get_next_state(u1, true_props)
done = u2 in self.terminal_states
# Getting the reward
rew = self._get_reward(u1,u2,s_info,add_rs, env_done)
rew = self._get_reward(u1, phi, u2, s_info, add_rs, env_done)

return u2, rew, done

@@ -64,23 +66,25 @@ def get_states(self):

def get_useful_transitions(self, u1):
# This is an auxiliary method used by the HRL baseline to prune "useless" options
return [self.delta_u[u1][u2].split("&") for u2 in self.delta_u[u1] if u1 != u2]
return [phi.split("&") for phi, u2 in self.delta_u[u1].items() if u1 != u2]


# Private methods -----------------------------------

def _get_reward(self,u1,u2,s_info,add_rs,env_done):
def _get_reward(self, u1, phi, u2, s_info, add_rs, env_done):
"""
Returns the reward associated to this transition.
"""
# Getting reward from the RM
reward = 0 # NOTE: if the agent falls from the reward machine it receives reward of zero
if u1 in self.delta_r and u2 in self.delta_r[u1]:
reward += self.delta_r[u1][u2].get_reward(s_info)
reward = 0
# NOTE: if the agent falls from the reward machine, phi == "no formula" and it receives reward of zero.
if u1 in self.delta_r and phi in self.delta_r[u1]:
assert u1 in self.delta_u and phi in self.delta_u[u1] and self.delta_u[u1][phi] == u2
reward += self.delta_r[u1][phi].get_reward(s_info)
# Adding the reward shaping (if needed)
rs = 0.0
if add_rs:
un = self.terminal_u if env_done else u2 # If the env reached a terminal state, we have to use the potential from the terminal RM state to keep RS optimality guarantees
un = self.terminal_u if env_done else u2 # If the env reached a terminal state, we have to use the potential from the terminal RM state to keep RS optimality guarantees
rs = self.gamma * self.potentials[un] - self.potentials[u1]
# Returning final reward
return reward + rs
@@ -103,30 +107,28 @@ def _load_reward_machine(self, file):
f.close()
# setting the DFA
self.u0 = eval(lines[0])
terminal_states = eval(lines[1])
self.terminal_states += eval(lines[1])
# adding transitions
for e in lines[2:]:
# Reading the transition
u1, u2, dnf_formula, reward_function = eval(e)
# terminal states
if u1 in terminal_states:
if u1 in self.terminal_states:
continue
if u2 in terminal_states:
u2 = self.terminal_u
# Adding machine state
self._add_state([u1,u2])
# Adding state-transition to delta_u
if u1 not in self.delta_u:
self.delta_u[u1] = {}
self.delta_u[u1][u2] = dnf_formula
self.delta_u[u1][dnf_formula] = u2
# Adding reward-transition to delta_r
if u1 not in self.delta_r:
self.delta_r[u1] = {}
self.delta_r[u1][u2] = reward_function
self.delta_r[u1][dnf_formula] = reward_function
# Sorting self.U... just because...
self.U = sorted(self.U)

def _add_state(self, u_list):
for u in u_list:
if u not in self.U and u != self.terminal_u:
if u not in self.U and u not in self.terminal_states:
self.U.append(u)
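
To make the new interface concrete, here is a minimal sketch of stepping a reward machine after this change (the path and proposition string are illustrative; it assumes the script runs from the `reward_machines/` folder, as `run.py` does):

```python
from reward_machines.reward_machine import RewardMachine

# Load the office task whose transitions are shown above; edges are now keyed by their DNF formula.
rm = RewardMachine("./envs/grids/reward_machines/office/t3.txt")

u = rm.reset()                               # initial RM state (0)
u_next, phi = rm.get_next_state(u, "a")      # now also returns the satisfied formula ('a&!n')
u, rew, done = rm.step(u, "a", s_info=None)  # ConstantRewardFunction ignores s_info
```
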
11 changes: 6 additions & 5 deletions reward_machines/reward_machines/reward_machine_utils.py
@@ -24,21 +24,22 @@ def evaluate_dnf(formula,true_props):
if formula == "False": return False
return formula in true_props

def value_iteration(U, delta_u, delta_r, terminal_u, gamma):
def value_iteration(U, delta_u, delta_r, terminal_states, gamma):
"""
Standard value iteration approach.
We use it to compute the potential function for the automated reward shaping
"""
V = dict([(u,0) for u in U])
V[terminal_u] = 0
for s in terminal_states:
V[s] = 0
V_error = 1
while V_error > 0.0000001:
V_error = 0
for u1 in U:
q_u2 = []
for u2 in delta_u[u1]:
if delta_r[u1][u2].get_type() == "constant":
r = delta_r[u1][u2].get_reward(None)
for phi, u2 in delta_u[u1].items():
if delta_r[u1][phi].get_type() == "constant":
r = delta_r[u1][phi].get_reward(None)
else:
r = 0 # If the reward function is not constant, we assume it returns a reward of zero
q_u2.append(r+gamma*V[u2])
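
For context, the potentials produced here feed the standard potential-based shaping term applied in `_get_reward` above; a small illustrative helper (not part of the diff) that mirrors that computation:

```python
def shaped_reward(base_reward, u1, u2, gamma, potentials):
    """Potential-based reward shaping as applied in RewardMachine._get_reward.

    potentials[u] holds -V(u) from value_iteration; the sign flip happens in
    add_reward_shaping before the potentials are used.
    """
    return base_reward + gamma * potentials[u2] - potentials[u1]
```
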
11 changes: 7 additions & 4 deletions reward_machines/reward_machines/rm_environment.py
@@ -225,10 +225,13 @@ def __init__(self, env, r_min, r_max, use_self_loops, add_rs, gamma, rs_gamma):
# Extracting the set of options available (one per edge in the RM)
if use_self_loops:
# This version includes options for self-loops!
self.options = [(rm_id,u1,u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u for u2 in rm.delta_u[u1]]
self.options = list({(rm_id, u1, u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u
for phi, u2 in rm.delta_u[u1].items()})
else:
# This version does not include options for the self-loops!
self.options = [(rm_id,u1,u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u for u2 in rm.delta_u[u1] if u1 != u2]
self.options = list({(rm_id, u1, u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u
for phi, u2 in rm.delta_u[u1].items() if u1 != u2})

self.num_options = len(self.options)
self.valid_options = {}
self.option_features = {}
@@ -306,9 +309,9 @@ def _get_option_experience(self, option_id, obs, action, next_obs, env_done, tru

# Computing the reward for the option
opt_rew = rm_rew
if u1 != u2 == un:
if u1 != u2 and u2 == un:
opt_rew += self.r_max # Extra positive reward because the agent accomplished this option
elif done:
elif done:
opt_rew += self.r_min # Extra negative reward because the agent failed to accomplish this option

return opt_obs,action,opt_rew,opt_next_obs,done
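
A small illustration (with a hypothetical `delta_u`, not taken from the repo) of why the option list is now built from a set: with transitions keyed by formula, two different formulas out of the same `u1` can lead to the same `u2`, which would otherwise produce duplicate `(rm_id, u1, u2)` options:

```python
# Hypothetical transition table in the new formula-keyed layout.
delta_u = {0: {"a&!n": 1, "b&!n": 1, "!a&!b&!n": 0}}

rm_id = 0
options = list({(rm_id, u1, u2) for u1 in delta_u for u2 in delta_u[u1].values() if u1 != u2})
print(options)  # [(0, 0, 1)] -- both formulas reaching state 1 collapse into a single option
```
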
14 changes: 7 additions & 7 deletions scripts/run_office.sh
@@ -1,6 +1,6 @@
#!/bin/bash
cd ../reward_machines
for i in `seq 0 59`;
for i in `seq 0 59`;
do
# Multi-task
python3.6 run.py --alg=qlearning --env=Office-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office/M1/$i
@@ -11,10 +11,10 @@ do
python3.6 run.py --alg=hrm --env=Office-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office/M1/$i --use_rs

# Single task
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office-single/M1/$i
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm/office-single/M1/$i --use_crm
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm-rs/office-single/M1/$i --use_crm --use_rs
python3.6 run.py --alg=hrm --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm/office-single/M1/$i
python3.6 run.py --alg=hrm --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office-single/M1/$i
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm/office-single/M1/$i --use_crm
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm-rs/office-single/M1/$i --use_crm --use_rs
python3.6 run.py --alg=hrm --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm/office-single/M1/$i
python3.6 run.py --alg=hrm --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office-single/M1/$i --use_rs
done