aec wrappers donezo, normalized vec integration tested
umutucak committed Nov 21, 2023
1 parent eb9d74e commit 69aebac
Showing 3 changed files with 123 additions and 20 deletions.
121 changes: 110 additions & 11 deletions momaland/utils/aec_wrappers.py
@@ -1,31 +1,130 @@
"""Various wrappers for AEC MO environments
"""
"""Various wrappers for AEC MO environments."""

import numpy as np


class Wrapper:
"""Base class for wrappers.
"""
"""Base class for wrappers."""

def __init__(self, env):
"""Base wrapper initialization to save the base env."""
self._env = env

def __getattr__(self, name):
"""Provide proxy access to regular attributes of wrapped objects.
"""
"""Provide proxy access to regular attributes of wrapped objects."""
return getattr(self._env, name)


class LinearizeReward(Wrapper):
"""Convert MO reward vector into scalar SO reward value.
`weights` represents the weights of each objective in the reward vector space.
"""

def __init__(self, env, weights:np.ndarray):
def __init__(self, env, weights: np.ndarray):
"""Reward linearization class initializer.
Args:
env: base env to add the wrapper on.
weights: an ndarray, the same size as the reward vector, holding the weight of each objective.
"""
self.weights = weights
super().__init__(env)

def last(self):
"""Returns a reward scalar from the reward vector.
"""Returns a reward scalar from the reward vector."""
observation, rewards, termination, truncation, info = self._env.last()
rewards = np.array([np.dot(rewards, self.weights)])
return observation, rewards, termination, truncation, info
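# Illustrative usage sketch (not part of this commit): scalarizing a 3-objective
# AEC env with LinearizeReward. `make_mo_aec_env` is a placeholder constructor.
def _linearize_reward_example(make_mo_aec_env):
    """Show that a reward vector r comes back as the length-1 array [np.dot(r, weights)]."""
    env = LinearizeReward(make_mo_aec_env(), weights=np.array([0.3, 0.3, 0.4]))
    env.reset(seed=42)
    # e.g. a raw reward vector [1.0, 0.0, 2.0] is returned as [1.1]
    observation, reward, termination, truncation, info = env.last()
    return reward  # shape (1,)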


class RunningMeanStd:
"""Tracks the mean, variance and count of values."""

# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()):
"""Tracks the mean, variance and count of values."""
self.mean = np.zeros(shape, "float64")
self.var = np.ones(shape, "float64")
self.count = epsilon

def update(self, x):
"""Updates the mean, var and count from a batch of samples."""
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
self.update_from_moments(batch_mean, batch_var, batch_count)

def update_from_moments(self, batch_mean, batch_var, batch_count):
"""Updates from batch mean, variance and count moments."""
self.mean, self.var, self.count = update_mean_var_count_from_moments(
self.mean, self.var, self.count, batch_mean, batch_var, batch_count
)


def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count):
"""Updates the mean, var and count using the previous mean, var, count and batch values."""
delta = batch_mean - mean
tot_count = count + batch_count

new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
m2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
new_var = m2 / tot_count
new_count = tot_count

return new_mean, new_var, new_count
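# Sanity-check sketch (illustration, not part of this commit): merging two batches
# through RunningMeanStd should agree with computing the statistics directly on the
# concatenated data, up to the 1e-4 pseudo-count used at initialization.
def _running_mean_std_check():
    """Compare the merged moments against np.mean/np.var on the concatenated batches."""
    rng = np.random.default_rng(0)
    a, b = rng.normal(size=(100, 3)), rng.normal(size=(50, 3))
    rms = RunningMeanStd(shape=(3,))
    rms.update(a)
    rms.update(b)
    both = np.concatenate([a, b])
    assert np.allclose(rms.mean, both.mean(axis=0), atol=1e-3)
    assert np.allclose(rms.var, both.var(axis=0), atol=1e-3)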


class NormalizeReward(Wrapper):
r"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
The exponential moving average will have variance :math:`(1 - \gamma)^2`.
Note:
The scaling depends on past trajectories and rewards will not be scaled correctly if the wrapper was newly
instantiated or the policy was changed recently.
"""

def __init__(
self,
env,
agent,
indices,
gamma: float = 0.99,
epsilon: float = 1e-8,
):
"""This wrapper will normalize immediate rewards s.t. their exponential moving average has a fixed variance.
Args:
env: The environment to apply the wrapper
agent: the name (string) of the agent whose reward(s) will be normalized.
indices: an ndarray with the indices of the reward vector entries that should be normalized.
epsilon: a stability parameter.
gamma: The discount factor that is used in the exponential moving average.
"""
super().__init__(env)
self._env = env
self.agent = agent
self.indices = indices
self.num_rewards = self._env.reward_spaces[agent].shape[0]
self.return_rms = np.array(
[RunningMeanStd(shape=()) for _ in range(self.num_rewards)]
) # separate runningmeanstd for each obj
self.returns = 0
self.gamma = gamma
self.epsilon = epsilon

def last(self):
"""Returns the last obs, rew, term, trunc, info; normalizing the rewards returned."""
observation, reward, termination, truncation, info = self._env.last()
self.returns = self.returns * self.gamma * (1 - termination) + reward
for i in self.indices:
reward[i] = self.normalize(reward[i], i)
return observation, reward, termination, truncation, info

def normalize(self, rews, i):
"""Normalizes the rewards with the running mean rewards and their variance."""
self.return_rms[i].update(self.returns)
return rews / np.sqrt(self.return_rms[i].var + self.epsilon)
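For intuition, a minimal sketch of what the normalization above does for a single objective (illustration only, not part of the diff): the wrapper accumulates a discounted return and divides the selected reward entries by the running standard deviation of that accumulator.

    returns = 0.0
    gamma, epsilon = 0.99, 1e-8
    rms = RunningMeanStd(shape=())
    for raw_reward in [1.0, 0.5, 2.0]:          # rewards for one objective
        returns = returns * gamma + raw_reward  # termination assumed False here
        rms.update(np.array([returns]))         # update the running statistics
        normalized = raw_reward / np.sqrt(rms.var + epsilon)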
10 changes: 5 additions & 5 deletions momaland/utils/parallel_wrappers.py
@@ -33,13 +33,13 @@ def __init__(self, env, weights: np.ndarray):

def step(self, actions):
"""Returns a reward scalar from the reward vector."""
observations, rewards, terminations, truncation, infos = self._env.step(actions)
observations, rewards, terminations, truncations, infos = self._env.step(actions)
_rewards = np.array([np.dot(rewards[agent], self.weights) for agent in rewards.keys()])
i = 0
for key, _ in rewards.items():
rewards[key] = np.array([_rewards[i]])
i += 1
return observations, rewards, terminations, truncation, infos
return observations, rewards, terminations, truncations, infos


class RunningMeanStd:
@@ -122,15 +122,15 @@ def __init__(

def step(self, actions):
"""Steps through the environment, normalizing the rewards returned."""
observations, rewards, terminateds, truncateds, infos = self._env.step(actions)
observations, rewards, terminations, truncations, infos = self._env.step(actions)
reward = np.array(rewards[self.agent])
self.returns = self.returns * self.gamma * (1 - terminateds[self.agent]) + reward
self.returns = self.returns * self.gamma * (1 - terminations[self.agent]) + reward

for i in self.indices:
reward[i] = self.normalize(reward[i], i)

rewards[self.agent] = reward
return observations, rewards, terminateds, truncateds, infos
return observations, rewards, terminations, truncations, infos

def normalize(self, rews, i):
"""Normalizes the rewards with the running mean rewards and their variance."""
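The parallel wrappers are used the same way, with step() in place of last(). A usage sketch (the import path, env constructor, and reset/step signatures are assumptions following the PettingZoo parallel API; the module alias mirrors the AECWrappers alias in wrapper_testing.py below):

    import numpy as np
    from momaland.utils import parallel_wrappers as ParallelWrappers

    penv = make_mo_parallel_env()  # placeholder constructor
    penv = ParallelWrappers.LinearizeReward(penv, np.array([0.33, 0.33, 0.33]))
    penv = ParallelWrappers.NormalizeReward(penv, penv.possible_agents[0], [0])

    observations, infos = penv.reset(seed=42)
    actions = {agent: penv.action_space(agent).sample() for agent in penv.agents}
    observations, rewards, terminations, truncations, infos = penv.step(actions)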
12 changes: 8 additions & 4 deletions wrapper_testing.py
@@ -30,12 +30,16 @@ def parallel_test():
def aec_test():
"""Full AECEnv lifecycle for testing wrappers."""
env = _env.env(shared_reward=False)
env = AECWrappers.LinearizeReward(env, np.array([0.3, 0.3, 0.4]))
env = AECWrappers.LinearizeReward(env, np.array([0.33, 0.33, 0.33]))
env = AECWrappers.NormalizeReward(env, env.possible_agents[0], [0])
env = AECWrappers.NormalizeReward(env, env.possible_agents[1], [0])
env = AECWrappers.NormalizeReward(env, env.possible_agents[2], [0])

env.reset(seed=42)

for agent in env.agent_iter():
_, reward, termination, truncation, _ = env.last()
print(reward)
print(agent, reward)
if termination or truncation:
action = None
else:
@@ -46,5 +46,5 @@ def aec_test():


if __name__ == "__main__":
# aec_test()
parallel_test()
aec_test()
# parallel_test()
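In aec_test above, LinearizeReward is applied first, so each NormalizeReward wrapper receives the already-scalarized, length-1 reward from last(); that is presumably why indices is [0] for every agent. Assuming possible_agents lists exactly those three agents, the stacking can be written as a loop (equivalent sketch, not part of the commit):

    env = _env.env(shared_reward=False)
    env = AECWrappers.LinearizeReward(env, np.array([0.33, 0.33, 0.33]))
    for agent in env.possible_agents:
        env = AECWrappers.NormalizeReward(env, agent, [0])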
