Bug fix: RM transitions #51

Open · wants to merge 8 commits into master
4 changes: 4 additions & 0 deletions README.md
@@ -119,3 +119,7 @@ python3 play.py --env <environment_id>
```

where `<environment_id>` can be any environment from the office, craft, or water domains. To control the agent, use the WASD keys. The environments are described in the paper.

## Note: Bug fix

There was a bug in the way that environment terminal states were handled, which is now fixed (see PR #51). The bug only affected tasks in which a terminal state can be reached in two ways that give different rewards, namely some tasks in Office World and some in Water World (Craft World and Half-Cheetah should not be affected). After the fix, the Office World results remain consistent with the conclusions of the paper (Water World has not been re-run yet).
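
For intuition, the affected pattern can be sketched as follows (a hypothetical layout using the names from `reward_machine.py`, not one of the shipped tasks): two edges out of the same RM state reach terminal states with different rewards, and the old representation, which keyed transitions by their target state and collapsed all terminal states onto a single id, let the second edge overwrite the first.

```python
# Hypothetical sketch of the bug; names follow reward_machine.py, rewards are shown as strings.
terminal_u = -1  # all terminal states used to be collapsed onto this single id

# Old layout: rewards keyed by target state, so two edges into the terminal state collide.
delta_r_old = {0: {}}
delta_r_old[0][terminal_u] = "ConstantRewardFunction(0)"  # terminal reached via proposition 'a'
delta_r_old[0][terminal_u] = "ConstantRewardFunction(1)"  # terminal reached via 'b' -- overwrites the first

# New layout (this PR): rewards keyed by the edge's DNF formula, so both rewards survive.
delta_r_new = {0: {"a&!n": "ConstantRewardFunction(0)", "b&!n": "ConstantRewardFunction(1)"}}
```
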
11 changes: 6 additions & 5 deletions reward_machines/envs/__init__.py
@@ -41,11 +41,12 @@
max_episode_steps=1000
)

register(
id='Office-single-v0',
entry_point='envs.grids.grid_environment:OfficeRM3Env',
max_episode_steps=1000
)
for i in range(1, 5):
register(
id=f'Office-single-T{i}-v0',
entry_point=f'envs.grids.grid_environment:OfficeRM{i}Env',
max_episode_steps=1000
)

# ----------------------------------------- CRAFT
for i in range(11):
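
As a quick check of the new registrations, something like the following should work once the `envs` package has been imported so the `register()` calls above run (a hedged sketch; it assumes the working directory is the repo's `reward_machines/` folder, as in `run.py`):

```python
# Hedged usage sketch: importing `envs` is what executes the register() calls above.
import gym
import envs  # noqa: F401  -- registers Office-single-T1-v0 ... Office-single-T4-v0, among others

env = gym.make("Office-single-T2-v0")
obs = env.reset()
```
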
18 changes: 18 additions & 0 deletions reward_machines/envs/grids/grid_environment.py
@@ -120,12 +120,30 @@ def __init__(self):
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM1Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t1.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM2Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t2.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM3Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t3.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class OfficeRM4Env(GridRMEnv):
def __init__(self):
rm_files = ["./envs/grids/reward_machines/office/t4.txt"]
env = OfficeWorld()
super().__init__(GridEnv(env),rm_files)

class CraftRMEnv(GridRMEnv):
def __init__(self, file_map):
rm_files = ["./envs/grids/reward_machines/craft/t%d.txt"%i for i in range(1,11)]
19 changes: 9 additions & 10 deletions reward_machines/envs/grids/reward_machines/office/t3.txt
@@ -1,11 +1,10 @@
0 # initial state
[1] # terminal state
(0,0,'!e&!f&!n',ConstantRewardFunction(0))
(0,2,'e&!n',ConstantRewardFunction(0))
(0,3,'!e&f&!n',ConstantRewardFunction(0))
(2,2,'!f&!n',ConstantRewardFunction(0))
(2,4,'f&!n',ConstantRewardFunction(0))
(3,3,'!e&!n',ConstantRewardFunction(0))
(3,4,'e&!n',ConstantRewardFunction(0))
(4,1,'g&!n',ConstantRewardFunction(1))
(4,4,'!g&!n',ConstantRewardFunction(0))
[4] # terminal state
(0,0,'!a&!n',ConstantRewardFunction(0))
(0,1,'a&!n',ConstantRewardFunction(0))
(1,1,'!b&!n',ConstantRewardFunction(0))
(1,2,'b&!n',ConstantRewardFunction(0))
(2,2,'!c&!n',ConstantRewardFunction(0))
(2,3,'c&!n',ConstantRewardFunction(0))
(3,3,'!d&!n',ConstantRewardFunction(0))
(3,4,'d&!n',ConstantRewardFunction(1))
19 changes: 10 additions & 9 deletions reward_machines/envs/grids/reward_machines/office/t4.txt
@@ -1,10 +1,11 @@
0 # initial state
[4] # terminal state
(0,0,'!a&!n',ConstantRewardFunction(0))
(0,1,'a&!n',ConstantRewardFunction(0))
(1,1,'!b&!n',ConstantRewardFunction(0))
(1,2,'b&!n',ConstantRewardFunction(0))
(2,2,'!c&!n',ConstantRewardFunction(0))
(2,3,'c&!n',ConstantRewardFunction(0))
(3,3,'!d&!n',ConstantRewardFunction(0))
(3,4,'d&!n',ConstantRewardFunction(1))
[1] # terminal state
(0,0,'!e&!f&!n',ConstantRewardFunction(0))
(0,2,'e&!n',ConstantRewardFunction(0))
(0,3,'!e&f&!n',ConstantRewardFunction(0))
(2,2,'!f&!n',ConstantRewardFunction(0))
(2,4,'f&!n',ConstantRewardFunction(0))
(3,3,'!e&!n',ConstantRewardFunction(0))
(3,4,'e&!n',ConstantRewardFunction(0))
(4,1,'g&!n',ConstantRewardFunction(1))
(4,4,'!g&!n',ConstantRewardFunction(0))
54 changes: 28 additions & 26 deletions reward_machines/reward_machines/reward_machine.py
@@ -9,7 +9,8 @@ def __init__(self, file):
self.u0 = None # initial state
self.delta_u = {} # state-transition function
self.delta_r = {} # reward-transition function
self.terminal_u = -1 # All terminal states are sent to the same terminal state with id *-1*
self.terminal_u = -1
self.terminal_states = [self.terminal_u]
self._load_reward_machine(file)
self.known_transitions = {} # Auxiliary variable to speed up computation of the next RM state

@@ -22,25 +23,26 @@ def add_reward_shaping(self, gamma, rs_gamma):
- rs_gamma(float): the gamma used in the value iteration that computes the shaping potentials
"""
self.gamma = gamma
self.potentials = value_iteration(self.U, self.delta_u, self.delta_r, self.terminal_u, rs_gamma)
self.potentials = value_iteration(self.U, self.delta_u, self.delta_r, self.terminal_states, rs_gamma)
for u in self.potentials:
self.potentials[u] = -self.potentials[u]


def reset(self):
# Returns the initial state
return self.u0

def _compute_next_state(self, u1, true_props):
for u2 in self.delta_u[u1]:
if evaluate_dnf(self.delta_u[u1][u2], true_props):
return u2
return self.terminal_u # no transition is defined for true_props
""" Given the current state u1 and a proposition valuation, it returns the next state """
for phi, next_state in self.delta_u[u1].items():
if evaluate_dnf(phi, true_props):
# at most one formula should be satisfied by the valuation
return next_state, phi
return self.terminal_u, "no formula"

def get_next_state(self, u1, true_props):
if (u1,true_props) not in self.known_transitions:
u2 = self._compute_next_state(u1, true_props)
self.known_transitions[(u1,true_props)] = u2
u2, phi = self._compute_next_state(u1, true_props)
self.known_transitions[(u1,true_props)] = u2, phi
return self.known_transitions[(u1,true_props)]

def step(self, u1, true_props, s_info, add_rs=False, env_done=False):
@@ -50,11 +52,11 @@ def step(self, u1, true_props, s_info, add_rs=False, env_done=False):
"""

# Computing the next state in the RM and checking if the episode is done
assert u1 != self.terminal_u, "the RM was set to a terminal state!"
u2 = self.get_next_state(u1, true_props)
done = (u2 == self.terminal_u)
assert u1 not in self.terminal_states, f"the RM was set to a terminal state! {u1}"
u2, phi = self.get_next_state(u1, true_props)
done = u2 in self.terminal_states
# Getting the reward
rew = self._get_reward(u1,u2,s_info,add_rs, env_done)
rew = self._get_reward(u1, phi, u2, s_info, add_rs, env_done)

return u2, rew, done

@@ -64,23 +66,25 @@ def get_states(self):

def get_useful_transitions(self, u1):
# This is an auxiliary method used by the HRL baseline to prune "useless" options
return [self.delta_u[u1][u2].split("&") for u2 in self.delta_u[u1] if u1 != u2]
return [phi.split("&") for phi, u2 in self.delta_u[u1].items() if u1 != u2]


# Private methods -----------------------------------

def _get_reward(self,u1,u2,s_info,add_rs,env_done):
def _get_reward(self, u1, phi, u2, s_info, add_rs, env_done):
"""
Returns the reward associated to this transition.
"""
# Getting reward from the RM
reward = 0 # NOTE: if the agent falls from the reward machine it receives reward of zero
if u1 in self.delta_r and u2 in self.delta_r[u1]:
reward += self.delta_r[u1][u2].get_reward(s_info)
reward = 0
# NOTE: if the agent falls from the reward machine, phi == "no formula" and it receives reward of zero.
if u1 in self.delta_r and phi in self.delta_r[u1]:
assert u1 in self.delta_u and phi in self.delta_u[u1] and self.delta_u[u1][phi] == u2
reward += self.delta_r[u1][phi].get_reward(s_info)
# Adding the reward shaping (if needed)
rs = 0.0
if add_rs:
un = self.terminal_u if env_done else u2 # If the env reached a terminal state, we have to use the potential from the terminal RM state to keep RS optimality guarantees
un = self.terminal_u if env_done else u2 # If the env reached a terminal state, we have to use the potential from the terminal RM state to keep RS optimality guarantees
rs = self.gamma * self.potentials[un] - self.potentials[u1]
# Returning final reward
return reward + rs
@@ -103,30 +107,28 @@ def _load_reward_machine(self, file):
f.close()
# setting the DFA
self.u0 = eval(lines[0])
terminal_states = eval(lines[1])
self.terminal_states += eval(lines[1])
# adding transitions
for e in lines[2:]:
# Reading the transition
u1, u2, dnf_formula, reward_function = eval(e)
# terminal states
if u1 in terminal_states:
if u1 in self.terminal_states:
continue
if u2 in terminal_states:
u2 = self.terminal_u
# Adding machine state
self._add_state([u1,u2])
# Adding state-transition to delta_u
if u1 not in self.delta_u:
self.delta_u[u1] = {}
self.delta_u[u1][u2] = dnf_formula
self.delta_u[u1][dnf_formula] = u2
# Adding reward-transition to delta_r
if u1 not in self.delta_r:
self.delta_r[u1] = {}
self.delta_r[u1][u2] = reward_function
self.delta_r[u1][dnf_formula] = reward_function
# Sorting self.U... just because...
self.U = sorted(self.U)

def _add_state(self, u_list):
for u in u_list:
if u not in self.U and u != self.terminal_u:
if u not in self.U and u not in self.terminal_states:
self.U.append(u)
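
To make the new interface concrete, here is a minimal sketch of stepping a reward machine after this change (the path and proposition string are illustrative; it assumes the script runs from the `reward_machines/` folder, as `run.py` does):

```python
from reward_machines.reward_machine import RewardMachine

# Load the office task whose transitions are shown above; edges are now keyed by their DNF formula.
rm = RewardMachine("./envs/grids/reward_machines/office/t3.txt")

u = rm.reset()                               # initial RM state (0)
u_next, phi = rm.get_next_state(u, "a")      # now also returns the satisfied formula ('a&!n')
u, rew, done = rm.step(u, "a", s_info=None)  # ConstantRewardFunction ignores s_info
```
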
11 changes: 6 additions & 5 deletions reward_machines/reward_machines/reward_machine_utils.py
@@ -24,21 +24,22 @@ def evaluate_dnf(formula,true_props):
if formula == "False": return False
return formula in true_props

def value_iteration(U, delta_u, delta_r, terminal_u, gamma):
def value_iteration(U, delta_u, delta_r, terminal_states, gamma):
"""
Standard value iteration approach.
We use it to compute the potential function for the automated reward shaping
"""
V = dict([(u,0) for u in U])
V[terminal_u] = 0
for s in terminal_states:
V[s] = 0
V_error = 1
while V_error > 0.0000001:
V_error = 0
for u1 in U:
q_u2 = []
for u2 in delta_u[u1]:
if delta_r[u1][u2].get_type() == "constant":
r = delta_r[u1][u2].get_reward(None)
for phi, u2 in delta_u[u1].items():
if delta_r[u1][phi].get_type() == "constant":
r = delta_r[u1][phi].get_reward(None)
else:
r = 0 # If the reward function is not constant, we assume it returns a reward of zero
q_u2.append(r+gamma*V[u2])
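
For context, the potentials produced here feed the standard potential-based shaping term applied in `_get_reward` above; a small illustrative helper (not part of the diff) that mirrors that computation:

```python
def shaped_reward(base_reward, u1, u2, gamma, potentials):
    """Potential-based reward shaping as applied in RewardMachine._get_reward.

    potentials[u] holds -V(u) from value_iteration; the sign flip happens in
    add_reward_shaping before the potentials are used.
    """
    return base_reward + gamma * potentials[u2] - potentials[u1]
```
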
11 changes: 7 additions & 4 deletions reward_machines/reward_machines/rm_environment.py
@@ -225,10 +225,13 @@ def __init__(self, env, r_min, r_max, use_self_loops, add_rs, gamma, rs_gamma):
# Extracting the set of options available (one per edge in the RM)
if use_self_loops:
# This version includes options for self-loops!
self.options = [(rm_id,u1,u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u for u2 in rm.delta_u[u1]]
self.options = list({(rm_id, u1, u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u
for phi, u2 in rm.delta_u[u1].items()})
else:
# This version does not include options for the self-loops!
self.options = [(rm_id,u1,u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u for u2 in rm.delta_u[u1] if u1 != u2]
self.options = list({(rm_id, u1, u2) for rm_id, rm in enumerate(env.reward_machines) for u1 in rm.delta_u
for phi, u2 in rm.delta_u[u1].items() if u1 != u2})

self.num_options = len(self.options)
self.valid_options = {}
self.option_features = {}
@@ -306,9 +309,9 @@ def _get_option_experience(self, option_id, obs, action, next_obs, env_done, tru

# Computing the reward for the option
opt_rew = rm_rew
if u1 != u2 == un:
if u1 != u2 and u2 == un:
opt_rew += self.r_max # Extra positive reward because the agent accomplished this option
elif done:
elif done:
opt_rew += self.r_min # Extra negative reward because the agent failed to accomplish this option

return opt_obs,action,opt_rew,opt_next_obs,done
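
A small illustration (with a hypothetical `delta_u`, not taken from the repo) of why the option list is now built from a set: with transitions keyed by formula, two different formulas out of the same `u1` can lead to the same `u2`, which would otherwise produce duplicate `(rm_id, u1, u2)` options:

```python
# Hypothetical transition table in the new formula-keyed layout.
delta_u = {0: {"a&!n": 1, "b&!n": 1, "!a&!b&!n": 0}}

rm_id = 0
options = list({(rm_id, u1, u2) for u1 in delta_u for u2 in delta_u[u1].values() if u1 != u2})
print(options)  # [(0, 0, 1)] -- both formulas reaching state 1 collapse into a single option
```
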
14 changes: 7 additions & 7 deletions scripts/run_office.sh
@@ -1,6 +1,6 @@
#!/bin/bash
cd ../reward_machines
for i in `seq 0 59`;
for i in `seq 0 59`;
do
# Multi-task
python3.6 run.py --alg=qlearning --env=Office-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office/M1/$i
@@ -11,10 +11,10 @@ do
python3.6 run.py --alg=hrm --env=Office-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office/M1/$i --use_rs

# Single task
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office-single/M1/$i
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm/office-single/M1/$i --use_crm
python3.6 run.py --alg=qlearning --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm-rs/office-single/M1/$i --use_crm --use_rs
python3.6 run.py --alg=hrm --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm/office-single/M1/$i
python3.6 run.py --alg=hrm --env=Office-single-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql/office-single/M1/$i
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/ql-rs/office-single/M1/$i --use_rs
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm/office-single/M1/$i --use_crm
python3.6 run.py --alg=qlearning --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/crm-rs/office-single/M1/$i --use_crm --use_rs
python3.6 run.py --alg=hrm --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm/office-single/M1/$i
python3.6 run.py --alg=hrm --env=Office-single-T1-v0 --num_timesteps=1e5 --gamma=0.9 --log_path=../my_results/hrm-rs/office-single/M1/$i --use_rs
done