aec wrappers donezo, normalized vec integration tested
umutucak committed Nov 21, 2023
1 parent eb9d74e commit 69aebac
Showing 3 changed files with 123 additions and 20 deletions.
121 changes: 110 additions & 11 deletions momaland/utils/aec_wrappers.py
@@ -1,31 +1,130 @@
"""Various wrappers for AEC MO environments
"""
"""Various wrappers for AEC MO environments."""

import numpy as np


class Wrapper:
"""Base class for wrappers.
"""
"""Base class for wrappers."""

def __init__(self, env):
"""Base wrapper initialization to save the base env."""
self._env = env

def __getattr__(self, name):
"""Provide proxy access to regular attributes of wrapped objects.
"""
"""Provide proxy access to regular attributes of wrapped objects."""
return getattr(self._env, name)


class LinearizeReward(Wrapper):
"""Convert MO reward vector into scalar SO reward value.
`weights` represents the weights of each objective in the reward vector space.
"""

def __init__(self, env, weights:np.ndarray):
def __init__(self, env, weights: np.ndarray):
"""Reward linearization class initializer.
Args:
env: base env to add the wrapper on.
weights: an ndarray, the same size as the reward vector, holding the weight of each objective.
"""
self.weights = weights
super().__init__(env)

def last(self):
"""Returns a reward scalar from the reward vector.
"""Returns a reward scalar from the reward vector."""
observation, rewards, termination, truncation, info = self._env.last()
rewards = np.array([np.dot(rewards, self.weights)])
return observation, rewards, termination, truncation, info
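# Illustrative usage sketch (not part of this commit): scalarizing a 3-objective
# AEC env with LinearizeReward. `make_mo_aec_env` is a placeholder constructor.
def _linearize_reward_example(make_mo_aec_env):
    """Show that a reward vector r comes back as the length-1 array [np.dot(r, weights)]."""
    env = LinearizeReward(make_mo_aec_env(), weights=np.array([0.3, 0.3, 0.4]))
    env.reset(seed=42)
    # e.g. a raw reward vector [1.0, 0.0, 2.0] is returned as [1.1]
    observation, reward, termination, truncation, info = env.last()
    return reward  # shape (1,)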


class RunningMeanStd:
"""Tracks the mean, variance and count of values."""

# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()):
"""Tracks the mean, variance and count of values."""
self.mean = np.zeros(shape, "float64")
self.var = np.ones(shape, "float64")
self.count = epsilon

def update(self, x):
"""Updates the mean, var and count from a batch of samples."""
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
self.update_from_moments(batch_mean, batch_var, batch_count)

def update_from_moments(self, batch_mean, batch_var, batch_count):
"""Updates from batch mean, variance and count moments."""
self.mean, self.var, self.count = update_mean_var_count_from_moments(
self.mean, self.var, self.count, batch_mean, batch_var, batch_count
)


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
"""Updates the mean, var and count using the previous mean, var, count and batch values."""
delta = batch_mean - mean
tot_count = count + batch_count

new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
m2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
new_var = m2 / tot_count
new_count = tot_count

return new_mean, new_var, new_count
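# Sanity-check sketch (illustration, not part of this commit): merging two batches
# through RunningMeanStd should agree with computing the statistics directly on the
# concatenated data, up to the 1e-4 pseudo-count used at initialization.
def _running_mean_std_check():
    """Compare the merged moments against np.mean/np.var on the concatenated batches."""
    rng = np.random.default_rng(0)
    a, b = rng.normal(size=(100, 3)), rng.normal(size=(50, 3))
    rms = RunningMeanStd(shape=(3,))
    rms.update(a)
    rms.update(b)
    both = np.concatenate([a, b])
    assert np.allclose(rms.mean, both.mean(axis=0), atol=1e-3)
    assert np.allclose(rms.var, both.var(axis=0), atol=1e-3)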


class NormalizeReward(Wrapper):
r"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
The exponential moving average will have variance :math:`(1 - \gamma)^2`.
Note:
The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
instantiated or the policy was changed recently.
"""

def __init__(
self,
env,
agent,
indices,
gamma: float = 0.99,
epsilon: float = 1e-8,
):
"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
Args:
env: The environment to apply the wrapper
agent: the name (string) of the agent whose reward(s) will be normalized.
indices: an ndarray with the indices of the reward vector entries that should be normalized.
epsilon: a stability parameter.
gamma: The discount factor that is used in the exponential moving average.
"""
super().__init__(env)
self._env = env
self.agent = agent
self.indices = indices
self.num_rewards = self._env.reward_spaces[agent].shape[0]
self.return_rms = np.array(
[RunningMeanStd(shape=()) for _ in range(self.num_rewards)]
) # separate runningmeanstd for each obj
self.returns = 0
self.gamma = gamma
self.epsilon = epsilon

def last(self):
"""Returns the last obs, rew, term, trunc, info; normalizing the rewards returned."""
observation, reward, termination, truncation, info = self._env.last()
self.returns = self.returns * self.gamma * (1 - termination) + reward
for i in self.indices:
reward[i] = self.normalize(reward[i], i)
return observation, reward, termination, truncation, info

def normalize(self, rews, i):
"""Normalizes the rewards with the running mean rewards and their variance."""
self.return_rms[i].update(self.returns)
return rews / np.sqrt(self.return_rms[i].var + self.epsilon)
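For intuition, a minimal sketch of what the normalization above does for a single objective (illustration only, not part of the diff): the wrapper accumulates a discounted return and divides the selected reward entries by the running standard deviation of that accumulator.

    returns = 0.0
    gamma, epsilon = 0.99, 1e-8
    rms = RunningMeanStd(shape=())
    for raw_reward in [1.0, 0.5, 2.0]:          # rewards for one objective
        returns = returns * gamma + raw_reward  # termination assumed False here
        rms.update(np.array([returns]))         # update the running statistics
        normalized = raw_reward / np.sqrt(rms.var + epsilon)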
10 changes: 5 additions & 5 deletions momaland/utils/parallel_wrappers.py
@@ -33,13 +33,13 @@ def __init__(self, env, weights: np.ndarray):

def step(self, actions):
"""Returns a reward scalar from the reward vector."""
observations, rewards, terminations, truncation, infos = self._env.step(actions)
observations, rewards, terminations, truncations, infos = self._env.step(actions)
_rewards = np.array([np.dot(rewards[agent], self.weights) for agent in rewards.keys()])
i = 0
for key, _ in rewards.items():
rewards[key] = np.array([_rewards[i]])
i += 1
return observations, rewards, terminations, truncation, infos
return observations, rewards, terminations, truncations, infos


class RunningMeanStd:
@@ -122,15 +122,15 @@ def __init__(

def step(self, actions):
"""Steps through the environment, normalizing the rewards returned."""
observations, rewards, terminateds, truncateds, infos = self._env.step(actions)
observations, rewards, terminations, truncations, infos = self._env.step(actions)
reward = np.array(rewards[self.agent])
self.returns = self.returns * self.gamma * (1 - terminateds[self.agent]) + reward
self.returns = self.returns * self.gamma * (1 - terminations[self.agent]) + reward

for i in self.indices:
reward[i] = self.normalize(reward[i], i)

rewards[self.agent] = reward
return observations, rewards, terminateds, truncateds, infos
return observations, rewards, terminations, truncations, infos

def normalize(self, rews, i):
"""Normalizes the rewards with the running mean rewards and their variance."""
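The parallel wrappers are used the same way, with step() in place of last(). A usage sketch (the import path, env constructor, and reset/step signatures are assumptions following the PettingZoo parallel API; the module alias mirrors the AECWrappers alias in wrapper_testing.py below):

    import numpy as np
    from momaland.utils import parallel_wrappers as ParallelWrappers

    penv = make_mo_parallel_env()  # placeholder constructor
    penv = ParallelWrappers.LinearizeReward(penv, np.array([0.33, 0.33, 0.33]))
    penv = ParallelWrappers.NormalizeReward(penv, penv.possible_agents[0], [0])

    observations, infos = penv.reset(seed=42)
    actions = {agent: penv.action_space(agent).sample() for agent in penv.agents}
    observations, rewards, terminations, truncations, infos = penv.step(actions)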
12 changes: 8 additions & 4 deletions wrapper_testing.py
@@ -30,12 +30,16 @@ def parallel_test():
def aec_test():
"""Full AECEnv lifecycle for testing wrappers."""
env = _env.env(shared_reward=False)
env = AECWrappers.LinearizeReward(env, np.array([0.3, 0.3, 0.4]))
env = AECWrappers.LinearizeReward(env, np.array([0.33, 0.33, 0.33]))
env = AECWrappers.NormalizeReward(env, env.possible_agents[0], [0])
env = AECWrappers.NormalizeReward(env, env.possible_agents[1], [0])
env = AECWrappers.NormalizeReward(env, env.possible_agents[2], [0])

env.reset(seed=42)

for agent in env.agent_iter():
_, reward, termination, truncation, _ = env.last()
print(reward)
print(agent, reward)
if termination or truncation:
action = None
else:
@@ -46,5 +46,5 @@ def aec_test():


if __name__ == "__main__":
# aec_test()
parallel_test()
aec_test()
# parallel_test()
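In aec_test above, LinearizeReward is applied first, so each NormalizeReward wrapper receives the already-scalarized, length-1 reward from last(); that is presumably why indices is [0] for every agent. Assuming possible_agents lists exactly those three agents, the stacking can be written as a loop (equivalent sketch, not part of the commit):

    env = _env.env(shared_reward=False)
    env = AECWrappers.LinearizeReward(env, np.array([0.33, 0.33, 0.33]))
    for agent in env.possible_agents:
        env = AECWrappers.NormalizeReward(env, agent, [0])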
