From 1d49049a67f14b79b422dd9521f32be864f419eb Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Tue, 1 Sep 2020 15:49:49 +0530 Subject: [PATCH 01/26] Single actor critic shared params --- genrl/agents/deep/a2c/a2c.py | 30 ++++++++-- genrl/agents/deep/base/base.py | 2 + genrl/agents/deep/base/offpolicy.py | 3 +- genrl/agents/deep/ddpg/ddpg.py | 26 ++++++++- genrl/agents/deep/ppo1/ppo1.py | 29 +++++++++- genrl/core/actor_critic.py | 70 +++++++++++++++++++++++- tests/test_deep/test_agents/test_a2c.py | 8 +++ tests/test_deep/test_agents/test_ddpg.py | 26 +++++++++ tests/test_deep/test_agents/test_ppo1.py | 8 +++ 9 files changed, 188 insertions(+), 14 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 595b9f14..e990f531 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -66,7 +66,24 @@ def _create_model(self) -> None: state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + shared_layers=self.shared_layers, + policy_layers=self.policy_layers, + value_layers=self.value_layers, + val_type="V", + discrete=discrete, + action_lim=action_lim, + ).to(self.device) + actor_params = list(self.ac.shared.parameters()) + list( + self.ac.actor.parameters() + ) + critic_params = list(self.ac.shared.parameters()) + list( + self.ac.critic.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, @@ -76,18 +93,21 @@ def _create_model(self) -> None: discrete=discrete, action_lim=action_lim, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() + else: self.ac = self.network.to(self.device) - - # action_dim = self.network.action_dim + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() if self.noise is not None: self.noise = self.noise( np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/agents/deep/base/base.py b/genrl/agents/deep/base/base.py index f37907a9..c2067c91 100644 --- a/genrl/agents/deep/base/base.py +++ b/genrl/agents/deep/base/base.py @@ -34,6 +34,7 @@ def __init__( create_model: bool = True, batch_size: int = 64, gamma: float = 0.99, + shared_layers=None, policy_layers: Tuple = (64, 64), value_layers: Tuple = (64, 64), lr_policy: float = 0.0001, @@ -45,6 +46,7 @@ def __init__( self.create_model = create_model self.batch_size = batch_size self.gamma = gamma + self.shared_layers = shared_layers self.policy_layers = policy_layers self.value_layers = value_layers self.lr_policy = lr_policy diff --git a/genrl/agents/deep/base/offpolicy.py b/genrl/agents/deep/base/offpolicy.py index 916a60ec..f64f3dff 100644 --- a/genrl/agents/deep/base/offpolicy.py +++ b/genrl/agents/deep/base/offpolicy.py @@ -174,7 +174,7 @@ def select_action( # add noise to output from policy network if self.noise is not None: - action += self.noise() + action = action + 
self.noise() return np.clip( action, self.env.action_space.low[0], self.env.action_space.high[0] @@ -233,7 +233,6 @@ def get_target_q_values( next_q_target_values = self.ac_target.get_value( torch.cat([next_states, next_target_actions], dim=-1) ) - target_q_values = rewards + self.gamma * (1 - dones) * next_q_target_values return target_q_values diff --git a/genrl/agents/deep/ddpg/ddpg.py b/genrl/agents/deep/ddpg/ddpg.py index 24f004b6..9adfc27a 100644 --- a/genrl/agents/deep/ddpg/ddpg.py +++ b/genrl/agents/deep/ddpg/ddpg.py @@ -62,7 +62,23 @@ def _create_model(self) -> None: np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + self.shared_layers, + self.policy_layers, + self.value_layers, + "Qsa", + False, + ).to(self.device) + actor_params = list(self.ac.actor.parameters()) + list( + self.ac.shared.parameters() + ) + critic_params = list(self.ac.critic.parameters()) + list( + self.ac.shared.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, @@ -71,13 +87,17 @@ def _create_model(self) -> None: "Qsa", False, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() else: self.ac = self.network + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() self.ac_target = deepcopy(self.ac).to(self.device) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def update_params(self, update_interval: int) -> None: """Update parameters of the model diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 0987d078..456aa7d1 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -66,10 +66,29 @@ def _create_model(self): state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str): + if isinstance(self.network, str) and self.shared_layers is not None: + self.ac = get_model("ac", self.network + "s")( + state_dim, + action_dim, + shared_layers=self.shared_layers, + policy_layers=self.policy_layers, + value_layers=self.value_layers, + val_typ="V", + discrete=discrete, + action_lim=action_lim, + activation=self.activation, + ).to(self.device) + actor_params = list(self.ac.shared.parameters()) + list( + self.ac.actor.parameters() + ) + critic_params = list(self.ac.shared.parameters()) + list( + self.ac.critic.parameters() + ) + elif isinstance(self.network, str) and self.shared_layers is None: self.ac = get_model("ac", self.network)( state_dim, action_dim, + shared_layers=self.shared_layers, policy_layers=self.policy_layers, value_layers=self.value_layers, val_typ="V", @@ -77,11 +96,15 @@ def _create_model(self): action_lim=action_lim, activation=self.activation, ).to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) + actor_params = self.ac.actor.parameters() + critic_params = self.ac.critic.parameters() - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), 
lr=self.lr_policy) - self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 6214ec46..1ce61f72 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -9,7 +9,7 @@ from genrl.core.base import BaseActorCritic from genrl.core.policies import MlpPolicy from genrl.core.values import MlpValue -from genrl.utils.utils import cnn +from genrl.utils.utils import cnn, mlp class MlpActorCritic(BaseActorCritic): @@ -41,6 +41,73 @@ def __init__( self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) +class MlpSharedActorCritic(BaseActorCritic): + """MLP Shared Actor Critic + + Attributes: + state_dim (int): State dimensions of the environment + action_dim (int): Action space dimensions of the environment + hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + val_type (str): Value type of the critic network + discrete (bool): True if the action space is discrete, else False + sac (bool): True if a SAC-like network is needed, else False + activation (str): Activation function to be used. Can be either "tanh" or "relu" + """ + + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + shared_layers: Tuple = (32, 32), + policy_layers: Tuple = (32, 32), + value_layers: Tuple = (32, 32), + val_type: str = "V", + discrete: bool = True, + **kwargs, + ): + super(MlpSharedActorCritic, self).__init__() + self.shared = mlp([state_dim] + list(shared_layers)) + self.actor = MlpPolicy( + shared_layers[-1], action_dim, policy_layers, discrete, **kwargs + ) + self.critic = MlpValue( + shared_layers[-1], action_dim, val_type, value_layers, **kwargs + ) + self.state_dim = state_dim + self.action_dim = action_dim + + def get_features(self, state: torch.Tensor): + features = self.shared(state) + return features + + def get_action(self, state: torch.Tensor, deterministic: bool = False): + state = torch.as_tensor(state).float() + features = self.get_features(state) + action_probs = self.actor(features) + action_probs = nn.Softmax(dim=-1)(action_probs) + + if deterministic: + action = torch.argmax(action_probs, dim=-1).unsqueeze(-1).float() + distribution = None + else: + distribution = Categorical(probs=action_probs) + action = distribution.sample() + + return action, distribution + + def get_value(self, state: torch.Tensor): + state = torch.as_tensor(state).float() + if self.critic.val_type == "Qsa": + features = self.shared(state[:, :, :-1]) + features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) + print(f"features {features.shape}") + value = self.critic(features).float().squeeze(-1) + else: + features = self.shared(state) + value = self.critic(features) + return value + + class MlpSingleActorMultiCritic(BaseActorCritic): """MLP Actor Critic @@ -220,6 +287,7 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: "mlp": MlpActorCritic, "cnn": CNNActorCritic, "mlp12": MlpSingleActorMultiCritic, + "mlps": MlpSharedActorCritic, } diff --git a/tests/test_deep/test_agents/test_a2c.py b/tests/test_deep/test_agents/test_a2c.py index 2b012069..f731f40f 100644 --- a/tests/test_deep/test_agents/test_a2c.py +++ b/tests/test_deep/test_agents/test_a2c.py @@ -19,3 +19,11 @@ def test_a2c_cnn(): trainer = OnPolicyTrainer(algo, env, 
log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_a2c_shared(): + env = VectorEnv("CartPole-v0", 1) + algo = A2C("mlp", env, shared_layers=(32, 32), rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ddpg.py b/tests/test_deep/test_agents/test_ddpg.py index ab309518..94670cef 100644 --- a/tests/test_deep/test_agents/test_ddpg.py +++ b/tests/test_deep/test_agents/test_ddpg.py @@ -29,3 +29,29 @@ def test_ddpg(): ) trainer.train() shutil.rmtree("./logs") + + +def test_ddpg_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = DDPG( + "mlp", + env, + batch_size=5, + noise=NormalActionNoise, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=4, + max_ep_len=200, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 3e9feaf2..1bb06a22 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,3 +19,11 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_ppo1_shared(): + env = VectorEnv("CartPole-v0") + algo = PPO1("mlp", env, shared_layers=(32, 32), rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") From ef4a179a321ed5a4f306067a77712948d7b2e93b Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 01:17:47 +0530 Subject: [PATCH 02/26] Shared layers for multi ACs --- genrl/agents/deep/a2c/a2c.py | 32 ++--- genrl/agents/deep/base/offpolicy.py | 2 +- genrl/agents/deep/ddpg/ddpg.py | 27 +--- genrl/agents/deep/ppo1/ppo1.py | 30 +--- genrl/agents/deep/sac/sac.py | 16 +-- genrl/agents/deep/td3/td3.py | 18 ++- genrl/core/actor_critic.py | 160 ++++++++++++++++++++- tests/test_deep/test_agents/test_custom.py | 7 + tests/test_deep/test_agents/test_ppo1.py | 8 -- tests/test_deep/test_agents/test_sac.py | 25 ++++ tests/test_deep/test_agents/test_td3.py | 26 ++++ 11 files changed, 253 insertions(+), 98 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index e990f531..86cd84ad 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -66,8 +66,11 @@ def _create_model(self) -> None: state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch_type = self.network + if self.shared_layers is not None: + arch_type += "s" + self.ac = get_model("ac", arch_type)( state_dim, action_dim, shared_layers=self.shared_layers, @@ -77,37 +80,18 @@ def _create_model(self) -> None: discrete=discrete, action_lim=action_lim, ).to(self.device) - actor_params = list(self.ac.shared.parameters()) + list( - self.ac.actor.parameters() - ) - critic_params = list(self.ac.shared.parameters()) + list( - self.ac.critic.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - 
policy_layers=self.policy_layers, - value_layers=self.value_layers, - val_type="V", - discrete=discrete, - action_lim=action_lim, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() if self.noise is not None: self.noise = self.noise( np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) - self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) + actor_params, critic_params = self.ac.get_params() + self.optimizer_policy = opt.Adam(critic_params, lr=self.lr_policy) + self.optimizer_value = opt.Adam(actor_params, lr=self.lr_value) def select_action( self, state: np.ndarray, deterministic: bool = False diff --git a/genrl/agents/deep/base/offpolicy.py b/genrl/agents/deep/base/offpolicy.py index f64f3dff..656d7911 100644 --- a/genrl/agents/deep/base/offpolicy.py +++ b/genrl/agents/deep/base/offpolicy.py @@ -174,7 +174,7 @@ def select_action( # add noise to output from policy network if self.noise is not None: - action = action + self.noise() + action += self.noise() return np.clip( action, self.env.action_space.low[0], self.env.action_space.high[0] diff --git a/genrl/agents/deep/ddpg/ddpg.py b/genrl/agents/deep/ddpg/ddpg.py index 9adfc27a..0d09314b 100644 --- a/genrl/agents/deep/ddpg/ddpg.py +++ b/genrl/agents/deep/ddpg/ddpg.py @@ -62,8 +62,11 @@ def _create_model(self) -> None: np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim) ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch_type = self.network + if self.shared_layers is not None: + arch_type += "s" + self.ac = get_model("ac", arch_type)( state_dim, action_dim, self.shared_layers, @@ -72,28 +75,10 @@ def _create_model(self) -> None: "Qsa", False, ).to(self.device) - actor_params = list(self.ac.actor.parameters()) + list( - self.ac.shared.parameters() - ) - critic_params = list(self.ac.critic.parameters()) + list( - self.ac.shared.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - self.policy_layers, - self.value_layers, - "Qsa", - False, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() + actor_params, critic_params = self.ac.get_params() self.ac_target = deepcopy(self.ac).to(self.device) self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 456aa7d1..7359e621 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -66,8 +66,11 @@ def _create_model(self): state_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network ) - if isinstance(self.network, str) and self.shared_layers is not None: - self.ac = get_model("ac", self.network + "s")( + if isinstance(self.network, str): + arch = self.network + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, shared_layers=self.shared_layers, @@ -78,31 +81,10 @@ def _create_model(self): action_lim=action_lim, 
activation=self.activation, ).to(self.device) - actor_params = list(self.ac.shared.parameters()) + list( - self.ac.actor.parameters() - ) - critic_params = list(self.ac.shared.parameters()) + list( - self.ac.critic.parameters() - ) - elif isinstance(self.network, str) and self.shared_layers is None: - self.ac = get_model("ac", self.network)( - state_dim, - action_dim, - shared_layers=self.shared_layers, - policy_layers=self.policy_layers, - value_layers=self.value_layers, - val_typ="V", - discrete=discrete, - action_lim=action_lim, - activation=self.activation, - ).to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() else: self.ac = self.network.to(self.device) - actor_params = self.ac.actor.parameters() - critic_params = self.ac.critic.parameters() + actor_params, critic_params = self.ac.get_params() self.optimizer_policy = opt.Adam(actor_params, lr=self.lr_policy) self.optimizer_value = opt.Adam(critic_params, lr=self.lr_value) diff --git a/genrl/agents/deep/sac/sac.py b/genrl/agents/deep/sac/sac.py index b7a5572d..54c7f87b 100644 --- a/genrl/agents/deep/sac/sac.py +++ b/genrl/agents/deep/sac/sac.py @@ -76,8 +76,10 @@ def _create_model(self, **kwargs) -> None: state_dim, action_dim, discrete, _ = get_env_properties( self.env, self.network ) - - self.ac = get_model("ac", self.network + "12")( + arch = self.network + "12" + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, policy_layers=self.policy_layers, @@ -92,13 +94,9 @@ def _create_model(self, **kwargs) -> None: self.model = self.network self.ac_target = deepcopy(self.ac) - - self.critic_params = list(self.ac.critic1.parameters()) + list( - self.ac.critic2.parameters() - ) - - self.optimizer_value = opt.Adam(self.critic_params, self.lr_value) - self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), self.lr_policy) + actor_params, critic_params = self.ac.get_params() + self.optimizer_value = opt.Adam(critic_params, self.lr_value) + self.optimizer_policy = opt.Adam(actor_params, self.lr_policy) if self.entropy_tuning: self.target_entropy = -torch.prod( diff --git a/genrl/agents/deep/td3/td3.py b/genrl/agents/deep/td3/td3.py index a9687446..5a8e83d2 100644 --- a/genrl/agents/deep/td3/td3.py +++ b/genrl/agents/deep/td3/td3.py @@ -68,10 +68,13 @@ def _create_model(self) -> None: ) if isinstance(self.network, str): - # Below, the "12" corresponds to the Single Actor, Double Critic network architecture - self.ac = get_model("ac", self.network + "12")( + arch = self.network + "12" + if self.shared_layers is not None: + arch += "s" + self.ac = get_model("ac", arch)( state_dim, action_dim, + shared_layers=self.shared_layers, policy_layers=self.policy_layers, value_layers=self.value_layers, val_type="Qsa", @@ -86,14 +89,9 @@ def _create_model(self) -> None: ) self.ac_target = deepcopy(self.ac) - - self.critic_params = list(self.ac.critic1.parameters()) + list( - self.ac.critic2.parameters() - ) - self.optimizer_value = torch.optim.Adam(self.critic_params, lr=self.lr_value) - self.optimizer_policy = torch.optim.Adam( - self.ac.actor.parameters(), lr=self.lr_policy - ) + actor_params, critic_params = self.ac.get_params() + self.optimizer_value = torch.optim.Adam(critic_params, lr=self.lr_value) + self.optimizer_policy = torch.optim.Adam(actor_params, lr=self.lr_policy) def update_params(self, update_interval: int) -> None: """Update parameters of the model diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 
1ce61f72..1b135ebd 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -29,6 +29,7 @@ def __init__( self, state_dim: spaces.Space, action_dim: spaces.Space, + shared_layers: None, policy_layers: Tuple = (32, 32), value_layers: Tuple = (32, 32), val_type: str = "V", @@ -40,6 +41,11 @@ def __init__( self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs) self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs) + def get_params(self): + actor_params = self.actor.parameters() + critic_params = self.critic.parameters() + return actor_params, critic_params + class MlpSharedActorCritic(BaseActorCritic): """MLP Shared Actor Critic @@ -76,7 +82,20 @@ def __init__( self.state_dim = state_dim self.action_dim = action_dim + def get_params(self): + actor_params = list(self.shared.parameters()) + list(self.actor.parameters()) + critic_params = list(self.shared.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_features(self, state: torch.Tensor): + """Extract features from the state, which is then an input to get_action and get_value + + Args: + state (:obj:`torch.Tensor`): The state(s) being passed + + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ features = self.shared(state) return features @@ -100,7 +119,6 @@ def get_value(self, state: torch.Tensor): if self.critic.val_type == "Qsa": features = self.shared(state[:, :, :-1]) features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) - print(f"features {features.shape}") value = self.critic(features).float().squeeze(-1) else: features = self.shared(state) @@ -144,6 +162,137 @@ def __init__( self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 + def get_params(self): + actor_params = self.actor.parameters() + critic_params = list(self.critic1.parameters()) + list( + self.critic2.parameters() + ) + return actor_params, critic_params + + def forward(self, x): + q1_values = self.critic1(x).squeeze(-1) + q2_values = self.critic2(x).squeeze(-1) + return (q1_values, q2_values) + + def get_action(self, state: torch.Tensor, deterministic: bool = False): + state = torch.as_tensor(state).float() + + if self.actor.sac: + mean, log_std = self.actor(state) + std = log_std.exp() + distribution = Normal(mean, std) + + action_probs = distribution.rsample() + log_probs = distribution.log_prob(action_probs) + action_probs = torch.tanh(action_probs) + + action = action_probs * self.action_scale + self.action_bias + + # enforcing action bound (appendix of SAC paper) + log_probs -= torch.log( + self.action_scale * (1 - action_probs.pow(2)) + np.finfo(np.float32).eps + ) + log_probs = log_probs.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + + action = (action.float(), log_probs, mean) + else: + action = self.actor.get_action(state, deterministic=deterministic) + + return action + + def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: + """Get Values from the Critic + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + mode (str): What values should be returned. 
Types: + "both" --> Both values will be returned + "min" --> The minimum of both values will be returned + "first" --> The value from the first critic only will be returned + + Returns: + values (:obj:`list`): List of values as estimated by each individual critic + """ + state = torch.as_tensor(state).float() + + if mode == "both": + values = self.forward(state) + elif mode == "min": + values = self.forward(state) + values = torch.min(*values).squeeze(-1) + elif mode == "first": + values = self.critic1(state) + else: + raise KeyError("Mode doesn't exist") + + return values + + +class MlpSharedSingleActorMultiCritic(BaseActorCritic): + """MLP Actor Critic + + Attributes: + state_dim (int): State dimensions of the environment + action_dim (int): Action space dimensions of the environment + hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + val_type (str): Value type of the critic network + discrete (bool): True if the action space is discrete, else False + num_critics (int): Number of critics in the architecture + sac (bool): True if a SAC-like network is needed, else False + activation (str): Activation function to be used. Can be either "tanh" or "relu" + """ + + def __init__( + self, + state_dim: spaces.Space, + action_dim: spaces.Space, + shared_layers: Tuple = (32, 32), + policy_layers: Tuple = (32, 32), + value_layers: Tuple = (32, 32), + val_type: str = "V", + discrete: bool = True, + num_critics: int = 2, + **kwargs, + ): + super(MlpSharedSingleActorMultiCritic, self).__init__() + + self.num_critics = num_critics + self.shared = mlp([state_dim] + list(shared_layers)) + self.actor = MlpPolicy( + shared_layers[-1], action_dim, policy_layers, discrete, **kwargs + ) + self.critic1 = MlpValue( + shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + ) + self.critic2 = MlpValue( + shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + ) + + self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 + self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 + + def get_params(self): + actor_params = list(self.actor.parameters()) + list(self.shared.parameters()) + critic_params = ( + list(self.critic1.parameters()) + + list(self.critic2.parameters()) + + list(self.shared.parameters()) + ) + return actor_params, critic_params + + def get_features(self, state: torch.Tensor): + """Extract features from the state, which is then an input to get_action and get_value + + Args: + state (:obj:`torch.Tensor`): The state(s) being passed + + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ + features = self.shared(state) + return features + def forward(self, x): q1_values = self.critic1(x).squeeze(-1) q2_values = self.critic2(x).squeeze(-1) @@ -151,6 +300,7 @@ def forward(self, x): def get_action(self, state: torch.Tensor, deterministic: bool = False): state = torch.as_tensor(state).float() + state = self.get_features(state) if self.actor.sac: mean, log_std = self.actor(state) @@ -190,6 +340,8 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: values (:obj:`list`): List of values as estimated by each individual critic """ state = torch.as_tensor(state).float() + x = self.get_features(state[:, :, :-1]) + state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) if mode == "both": values = self.forward(state) @@ -240,6 +392,11 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) + def get_params(self): + actor_params = 
list(self.feature.parameters()) + list(self.actor.parameters()) + critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: @@ -288,6 +445,7 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: "cnn": CNNActorCritic, "mlp12": MlpSingleActorMultiCritic, "mlps": MlpSharedActorCritic, + "mlp12s": MlpSharedSingleActorMultiCritic, } diff --git a/tests/test_deep/test_agents/test_custom.py b/tests/test_deep/test_agents/test_custom.py index c4614b70..a0c97063 100644 --- a/tests/test_deep/test_agents/test_custom.py +++ b/tests/test_deep/test_agents/test_custom.py @@ -24,6 +24,7 @@ def __init__( self, state_dim, action_dim, + shared_layers=None, policy_layers=(1, 1), value_layers=(1, 1), val_type="V", @@ -32,12 +33,18 @@ def __init__( super(custom_actorcritic, self).__init__( state_dim, action_dim, + shared_layers=shared_layers, policy_layers=policy_layers, value_layers=value_layers, val_type=val_type, **kwargs ) + def get_params(self): + actor_params = self.actor.parameters() + critic_params = self.critic.parameters() + return actor_params, critic_params + def test_custom_vpg(): env = VectorEnv("CartPole-v0", 1) diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 1bb06a22..3e9feaf2 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,11 +19,3 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") - - -def test_ppo1_shared(): - env = VectorEnv("CartPole-v0") - algo = PPO1("mlp", env, shared_layers=(32, 32), rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_sac.py b/tests/test_deep/test_agents/test_sac.py index 8755c5c4..3ea1cfee 100644 --- a/tests/test_deep/test_agents/test_sac.py +++ b/tests/test_deep/test_agents/test_sac.py @@ -21,3 +21,28 @@ def test_sac(): ) trainer.train() shutil.rmtree("./logs") + + +def test_sac_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = SAC( + "mlp", + env, + batch_size=5, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=5, + max_ep_len=500, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_td3.py b/tests/test_deep/test_agents/test_td3.py index e3d59491..35def46f 100644 --- a/tests/test_deep/test_agents/test_td3.py +++ b/tests/test_deep/test_agents/test_td3.py @@ -29,3 +29,29 @@ def test_td3(): ) trainer.train() shutil.rmtree("./logs") + + +def test_td3_shared(): + env = VectorEnv("Pendulum-v0", 2) + algo = TD3( + "mlp", + env, + batch_size=5, + noise=OrnsteinUhlenbeckActionNoise, + shared_layers=[1, 1], + policy_layers=[1, 1], + value_layers=[1, 1], + ) + + trainer = OffPolicyTrainer( + algo, + env, + log_mode=["csv"], + logdir="./logs", + epochs=5, + max_ep_len=500, + warmup_steps=10, + start_update=10, + ) + trainer.train() + shutil.rmtree("./logs") From 53450a8399530e68ba179cbe59ddeaa8f354a503 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 01:28:36 +0530 Subject: [PATCH 03/26] Fix lint errors (1) --- genrl/core/actor_critic.py | 15 +++++---------- 1 
file changed, 5 insertions(+), 10 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1b135ebd..1660e132 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -90,12 +90,12 @@ def get_params(self): def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value - Args: - state (:obj:`torch.Tensor`): The state(s) being passed + Args: + state (:obj:`torch.Tensor`): The state(s) being passed - Returns: - features (:obj:`torch.Tensor`): The feature(s) extracted from the state - """ + Returns: + features (:obj:`torch.Tensor`): The feature(s) extracted from the state + """ features = self.shared(state) return features @@ -392,11 +392,6 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) - def get_params(self): - actor_params = list(self.feature.parameters()) + list(self.actor.parameters()) - critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) - return actor_params, critic_params - def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: From 274aff98cb11915be2a984624f1e9c9bc22b2fa4 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 2 Sep 2020 02:14:03 +0530 Subject: [PATCH 04/26] Fixed tests --- genrl/core/actor_critic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index 1660e132..b45222de 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -392,6 +392,11 @@ def __init__( ) self.critic = MlpValue(output_size, action_dim, val_type, value_layers) + def get_params(self): + actor_params = list(self.feature.parameters()) + list(self.actor.parameters()) + critic_params = list(self.feature.parameters()) + list(self.critic.parameters()) + return actor_params, critic_params + def get_action( self, state: torch.Tensor, deterministic: bool = False ) -> torch.Tensor: From 38f95f00ee2397844fb174cda61641134adc4a04 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Thu, 3 Sep 2020 01:38:45 +0530 Subject: [PATCH 05/26] Changes to dicstrings and classes --- genrl/core/actor_critic.py | 141 +++++++++++------------ tests/test_deep/test_agents/test_ppo1.py | 8 ++ 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index b45222de..aa460be7 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -18,7 +18,8 @@ class MlpActorCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False sac (bool): True if a SAC-like network is needed, else False @@ -53,7 +54,9 @@ class MlpSharedActorCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): 
Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False sac (bool): True if a SAC-like network is needed, else False @@ -100,6 +103,18 @@ def get_features(self, state: torch.Tensor): return features def get_action(self, state: torch.Tensor, deterministic: bool = False): + """Get Actions from the actor + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False + + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ + state = torch.as_tensor(state).float() features = self.get_features(state) action_probs = self.actor(features) @@ -115,6 +130,14 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): return action, distribution def get_value(self, state: torch.Tensor): + """Get Values from the Critic + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + + Returns: + values (:obj:`list`): List of values as estimated by the critic + """ state = torch.as_tensor(state).float() if self.critic.val_type == "Qsa": features = self.shared(state[:, :, :-1]) @@ -132,7 +155,8 @@ class MlpSingleActorMultiCritic(BaseActorCritic): Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False num_critics (int): Number of critics in the architecture @@ -175,6 +199,17 @@ def forward(self, x): return (q1_values, q2_values) def get_action(self, state: torch.Tensor, deterministic: bool = False): + """Get Actions from the actor + + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False + + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ state = torch.as_tensor(state).float() if self.actor.sac: @@ -229,13 +264,15 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: return values -class MlpSharedSingleActorMultiCritic(BaseActorCritic): +class MlpSharedSingleActorMultiCritic(MlpSingleActorMultiCritic): """MLP Actor Critic Attributes: state_dim (int): State dimensions of the environment action_dim (int): Action space dimensions of the environment - hidden (:obj:`list` or :obj:`tuple`): Hidden layers in the MLP + shared_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the shared MLP + policy_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the policy MLP + value_layers (:obj:`list` or :obj:`tuple`): Hidden layers in the value MLP val_type (str): Value type of the critic network discrete (bool): True if the action space is discrete, else False num_critics (int): Number of critics in the architecture @@ -250,36 +287,22 @@ def __init__( shared_layers: Tuple = (32, 32), policy_layers: Tuple = (32, 32), value_layers: Tuple = (32, 32), - val_type: str = "V", + val_type: str = "Qsa", discrete: bool = True, num_critics: int = 
2, **kwargs, ): - super(MlpSharedSingleActorMultiCritic, self).__init__() - - self.num_critics = num_critics - self.shared = mlp([state_dim] + list(shared_layers)) - self.actor = MlpPolicy( - shared_layers[-1], action_dim, policy_layers, discrete, **kwargs - ) - self.critic1 = MlpValue( - shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs + super(MlpSharedSingleActorMultiCritic, self).__init__( + shared_layers[-1], + action_dim, + policy_layers, + value_layers, + val_type, + discrete, + num_critics, + **kwargs, ) - self.critic2 = MlpValue( - shared_layers[-1], action_dim, "Qsa", value_layers, **kwargs - ) - - self.action_scale = kwargs["action_scale"] if "action_scale" in kwargs else 1 - self.action_bias = kwargs["action_bias"] if "action_bias" in kwargs else 0 - - def get_params(self): - actor_params = list(self.actor.parameters()) + list(self.shared.parameters()) - critic_params = ( - list(self.critic1.parameters()) - + list(self.critic2.parameters()) - + list(self.shared.parameters()) - ) - return actor_params, critic_params + self.shared = mlp([state_dim] + list(shared_layers)) def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value @@ -293,41 +316,24 @@ def get_features(self, state: torch.Tensor): features = self.shared(state) return features - def forward(self, x): - q1_values = self.critic1(x).squeeze(-1) - q2_values = self.critic2(x).squeeze(-1) - return (q1_values, q2_values) - def get_action(self, state: torch.Tensor, deterministic: bool = False): - state = torch.as_tensor(state).float() - state = self.get_features(state) + """Get Actions from the actor - if self.actor.sac: - mean, log_std = self.actor(state) - std = log_std.exp() - distribution = Normal(mean, std) - - action_probs = distribution.rsample() - log_probs = distribution.log_prob(action_probs) - action_probs = torch.tanh(action_probs) - - action = action_probs * self.action_scale + self.action_bias - - # enforcing action bound (appendix of SAC paper) - log_probs -= torch.log( - self.action_scale * (1 - action_probs.pow(2)) + np.finfo(np.float32).eps - ) - log_probs = log_probs.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - - action = (action.float(), log_probs, mean) - else: - action = self.actor.get_action(state, deterministic=deterministic) + Arg: + state (:obj:`torch.Tensor`): The state(s) being passed to the critics + deterministic (bool): True if the action space is deterministic, else False - return action + Returns: + action (:obj:`list`): List of actions as estimated by the critic + distribution (): The distribution from which the action was sampled + (None if determinist + """ + return super(MlpSharedSingleActorMultiCritic, self).get_action( + self.get_features(state), deterministic=deterministic + ) - def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: - """Get Values from the Critic + def get_value(self, state: torch.Tensor, mode="first"): + """Get Values from both the Critic Arg: state (:obj:`torch.Tensor`): The state(s) being passed to the critics @@ -342,18 +348,7 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: state = torch.as_tensor(state).float() x = self.get_features(state[:, :, :-1]) state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) - - if mode == "both": - values = self.forward(state) - elif mode == "min": - values = self.forward(state) - values = torch.min(*values).squeeze(-1) - elif mode == "first": - values = 
self.critic1(state) - else: - raise KeyError("Mode doesn't exist") - - return values + return super(MlpSharedSingleActorMultiCritic, self).get_value(state, mode) class CNNActorCritic(BaseActorCritic): diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py index 3e9feaf2..97d40791 100644 --- a/tests/test_deep/test_agents/test_ppo1.py +++ b/tests/test_deep/test_agents/test_ppo1.py @@ -19,3 +19,11 @@ def test_ppo1_cnn(): trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() shutil.rmtree("./logs") + + +def test_ppo1_shared(): + env = VectorEnv("CartPole-v0") + algo = PPO1("mlp", env, shared_layers=[32, 32], rollout_size=128) + trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) + trainer.train() + shutil.rmtree("./logs") From 835819e193413e59aa3b09f63cd7b385ea3a359a Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Fri, 4 Sep 2020 22:55:36 +0530 Subject: [PATCH 06/26] Renaming Multi -> Two and comments --- genrl/core/actor_critic.py | 82 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 24 deletions(-) diff --git a/genrl/core/actor_critic.py b/genrl/core/actor_critic.py index aa460be7..a6fb7e49 100644 --- a/genrl/core/actor_critic.py +++ b/genrl/core/actor_critic.py @@ -75,7 +75,7 @@ def __init__( **kwargs, ): super(MlpSharedActorCritic, self).__init__() - self.shared = mlp([state_dim] + list(shared_layers)) + self.shared_network = mlp([state_dim] + list(shared_layers)) self.actor = MlpPolicy( shared_layers[-1], action_dim, policy_layers, discrete, **kwargs ) @@ -86,8 +86,12 @@ def __init__( self.action_dim = action_dim def get_params(self): - actor_params = list(self.shared.parameters()) + list(self.actor.parameters()) - critic_params = list(self.shared.parameters()) + list(self.critic.parameters()) + actor_params = list(self.shared_network.parameters()) + list( + self.actor.parameters() + ) + critic_params = list(self.shared_network.parameters()) + list( + self.critic.parameters() + ) return actor_params, critic_params def get_features(self, state: torch.Tensor): @@ -99,7 +103,7 @@ def get_features(self, state: torch.Tensor): Returns: features (:obj:`torch.Tensor`): The feature(s) extracted from the state """ - features = self.shared(state) + features = self.shared_network(state) return features def get_action(self, state: torch.Tensor, deterministic: bool = False): @@ -116,8 +120,8 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): """ state = torch.as_tensor(state).float() - features = self.get_features(state) - action_probs = self.actor(features) + shared_features = self.get_features(state) + action_probs = self.actor(shared_features) action_probs = nn.Softmax(dim=-1)(action_probs) if deterministic: @@ -139,17 +143,28 @@ def get_value(self, state: torch.Tensor): values (:obj:`list`): List of values as estimated by the critic """ state = torch.as_tensor(state).float() + if self.critic.val_type == "Qsa": - features = self.shared(state[:, :, :-1]) - features = torch.cat([features, state[:, :, -1].unsqueeze(-1)], dim=-1) - value = self.critic(features).float().squeeze(-1) + # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] + + # extract shared_features from just the state + # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] + shared_features = self.shared_network(state[:, :, : -self.action_dim]) + + # concatenate the actions to the extracted shared_features + # state[:, :, 
-action_dim:] -> [batch_size, number of vec envs, action_dim] + shared_features = torch.cat( + [shared_features, state[:, :, -self.action_dim :]], dim=-1 + ) + + value = self.critic(shared_features).float().squeeze(-1) else: - features = self.shared(state) - value = self.critic(features) + shared_features = self.shared_network(state) + value = self.critic(shared_features) return value -class MlpSingleActorMultiCritic(BaseActorCritic): +class MlpSingleActorTwoCritic(BaseActorCritic): """MLP Actor Critic Attributes: @@ -175,7 +190,7 @@ def __init__( num_critics: int = 2, **kwargs, ): - super(MlpSingleActorMultiCritic, self).__init__() + super(MlpSingleActorTwoCritic, self).__init__() self.num_critics = num_critics @@ -264,7 +279,7 @@ def get_value(self, state: torch.Tensor, mode="first") -> torch.Tensor: return values -class MlpSharedSingleActorMultiCritic(MlpSingleActorMultiCritic): +class MlpSharedSingleActorTwoCritic(MlpSingleActorTwoCritic): """MLP Actor Critic Attributes: @@ -292,7 +307,7 @@ def __init__( num_critics: int = 2, **kwargs, ): - super(MlpSharedSingleActorMultiCritic, self).__init__( + super(MlpSharedSingleActorTwoCritic, self).__init__( shared_layers[-1], action_dim, policy_layers, @@ -302,7 +317,19 @@ def __init__( num_critics, **kwargs, ) - self.shared = mlp([state_dim] + list(shared_layers)) + self.shared_network = mlp([state_dim] + list(shared_layers)) + self.action_dim = action_dim + + def get_params(self): + actor_params = list(self.shared_network.parameters()) + list( + self.actor.parameters() + ) + critic_params = ( + list(self.shared_network.parameters()) + + list(self.critic1.parameters()) + + list(self.critic2.parameters()) + ) + return actor_params, critic_params def get_features(self, state: torch.Tensor): """Extract features from the state, which is then an input to get_action and get_value @@ -313,7 +340,7 @@ def get_features(self, state: torch.Tensor): Returns: features (:obj:`torch.Tensor`): The feature(s) extracted from the state """ - features = self.shared(state) + features = self.shared_network(state) return features def get_action(self, state: torch.Tensor, deterministic: bool = False): @@ -326,9 +353,9 @@ def get_action(self, state: torch.Tensor, deterministic: bool = False): Returns: action (:obj:`list`): List of actions as estimated by the critic distribution (): The distribution from which the action was sampled - (None if determinist + (None if deterministic) """ - return super(MlpSharedSingleActorMultiCritic, self).get_action( + return super(MlpSharedSingleActorTwoCritic, self).get_action( self.get_features(state), deterministic=deterministic ) @@ -346,9 +373,16 @@ def get_value(self, state: torch.Tensor, mode="first"): values (:obj:`list`): List of values as estimated by each individual critic """ state = torch.as_tensor(state).float() - x = self.get_features(state[:, :, :-1]) - state = torch.cat([x, state[:, :, -1].unsqueeze(-1)], dim=-1) - return super(MlpSharedSingleActorMultiCritic, self).get_value(state, mode) + # state shape = [batch_size, number of vec envs, (state_dim + action_dim)] + + # extract shard features for just the state + # state[:, :, :-action_dim] -> [batch_size, number of vec envs, state_dim] + x = self.get_features(state[:, :, : -self.action_dim]) + + # concatenate the actions to the extracted shared features + # state[:, :, -action_dim:] -> [batch_size, number of vec envs, action_dim] + state = torch.cat([x, state[:, :, -self.action_dim :]], dim=-1) + return super(MlpSharedSingleActorTwoCritic, self).get_value(state, mode) 
class CNNActorCritic(BaseActorCritic): @@ -438,9 +472,9 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor: actor_critic_registry = { "mlp": MlpActorCritic, "cnn": CNNActorCritic, - "mlp12": MlpSingleActorMultiCritic, + "mlp12": MlpSingleActorTwoCritic, "mlps": MlpSharedActorCritic, - "mlp12s": MlpSharedSingleActorMultiCritic, + "mlp12s": MlpSharedSingleActorTwoCritic, } From 43ed950245c0af7cb845d2b81319a242f52a5080 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sun, 6 Sep 2020 20:10:27 +0530 Subject: [PATCH 07/26] Remove compute_advantage from rollout buffer class --- genrl/agents/deep/a2c/a2c.py | 5 ++- genrl/agents/deep/ppo1/ppo1.py | 5 +-- genrl/agents/deep/vpg/vpg.py | 5 ++- genrl/core/__init__.py | 5 ++- genrl/core/rollout_storage.py | 63 ++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 5 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 86cd84ad..50448c3b 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -7,6 +7,7 @@ import torch.optim as opt from genrl.agents.deep.base import OnPolicyAgent +from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -126,7 +127,9 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.rollout.compute_returns_and_advantage(values.detach().cpu().numpy(), dones) + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones + ) def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): """Evaluates actions taken by actor diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 7359e621..a3a70c80 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -7,6 +7,7 @@ import torch.optim as opt from genrl.agents import OnPolicyAgent +from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -139,8 +140,8 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.rollout.compute_returns_and_advantage( - values.detach().cpu().numpy(), dones, use_gae=True + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones, use_gae=True ) def update_params(self): diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index 88d9d1fa..1422ace6 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -6,6 +6,7 @@ import torch.optim as opt from genrl.agents import OnPolicyAgent +from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -114,7 +115,9 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.rollout.compute_returns_and_advantage(values.detach().cpu().numpy(), dones) + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones + ) def update_params(self) -> None: """Updates the the A2C network diff --git a/genrl/core/__init__.py b/genrl/core/__init__.py index 96824ac1..0e9e731e 100644 --- a/genrl/core/__init__.py +++ 
b/genrl/core/__init__.py @@ -15,7 +15,10 @@ MlpPolicy, get_policy_from_name, ) -from genrl.core.rollout_storage import RolloutBuffer # noqa +from genrl.core.rollout_storage import ( # noqa + RolloutBuffer, + compute_returns_and_advantage, +) from genrl.core.values import ( # noqa BaseValue, CnnCategoricalValue, diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index 078ca92a..17450d5a 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -318,3 +318,66 @@ def _get_samples(self, batch_inds: np.ndarray) -> RolloutBufferSamples: self.returns[batch_inds].flatten(), ) return RolloutBufferSamples(*tuple(map(self.to_torch, data))) + + +def compute_returns_and_advantage( + rollout_buffer: Union[RolloutBuffer, BaseBuffer], + last_value: torch.Tensor, + dones: np.ndarray, + use_gae: bool = False, +) -> None: + """ + Post-processing function: compute the returns (sum of discounted rewards) + and advantage (A(s) = R - V(S)). + Adapted from Stable-Baselines PPO2. + ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms + :param last_value: (torch.Tensor) + :param dones: (np.ndarray) + :param use_gae: (bool) Whether to use Generalized Advantage Estimation + or normal advantage for advantage computation. + """ + last_value = last_value.flatten() + + if use_gae: + last_gae_lam = 0 + for step in reversed(range(rollout_buffer.buffer_size)): + if step == rollout_buffer.buffer_size - 1: + next_non_terminal = 1.0 - dones + next_value = last_value + else: + next_non_terminal = 1.0 - rollout_buffer.dones[step + 1] + next_value = rollout_buffer.values[step + 1] + delta = ( + rollout_buffer.rewards[step] + + rollout_buffer.gamma * next_value * next_non_terminal + - rollout_buffer.values[step] + ) + last_gae_lam = ( + delta + + rollout_buffer.gamma + * rollout_buffer.gae_lambda + * next_non_terminal + * last_gae_lam + ) + rollout_buffer.advantages[step] = last_gae_lam + rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values + else: + # Discounted return with value bootstrap + # Note: this is equivalent to GAE computation + # with gae_lambda = 1.0 + last_return = 0.0 + for step in reversed(range(rollout_buffer.buffer_size)): + if step == rollout_buffer.buffer_size - 1: + next_non_terminal = 1.0 - dones + next_value = last_value + last_return = ( + rollout_buffer.rewards[step] + next_non_terminal * next_value + ) + else: + next_non_terminal = 1.0 - rollout_buffer.dones[step + 1] + last_return = ( + rollout_buffer.rewards[step] + + rollout_buffer.gamma * last_return * next_non_terminal + ) + rollout_buffer.returns[step] = last_return + rollout_buffer.advantages = rollout_buffer.returns - rollout_buffer.values From 2f0a7491507a6fa56334de3eea12f40a4f66f64c Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sun, 6 Sep 2020 20:24:42 +0530 Subject: [PATCH 08/26] Remove duplication --- genrl/core/rollout_storage.py | 53 ----------------------------------- 1 file changed, 53 deletions(-) diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index 17450d5a..6700b4ae 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -195,59 +195,6 @@ def reset(self) -> None: self.generator_ready = False super(RolloutBuffer, self).reset() - def compute_returns_and_advantage( - self, last_value: torch.Tensor, dones: np.ndarray, use_gae: bool = False - ) -> None: - """ - Post-processing step: compute the returns (sum of discounted rewards) - and advantage (A(s) = R 
- V(S)). - Adapted from Stable-Baselines PPO2. - :param last_value: (torch.Tensor) - :param dones: (np.ndarray) - :param use_gae: (bool) Whether to use Generalized Advantage Estimation - or normal advantage for advantage computation. - """ - last_value = last_value.flatten() - - if use_gae: - last_gae_lam = 0 - for step in reversed(range(self.buffer_size)): - if step == self.buffer_size - 1: - next_non_terminal = 1.0 - dones - next_value = last_value - else: - next_non_terminal = 1.0 - self.dones[step + 1] - next_value = self.values[step + 1] - delta = ( - self.rewards[step] - + self.gamma * next_value * next_non_terminal - - self.values[step] - ) - last_gae_lam = ( - delta - + self.gamma * self.gae_lambda * next_non_terminal * last_gae_lam - ) - self.advantages[step] = last_gae_lam - self.returns = self.advantages + self.values - else: - # Discounted return with value bootstrap - # Note: this is equivalent to GAE computation - # with gae_lambda = 1.0 - last_return = 0.0 - for step in reversed(range(self.buffer_size)): - if step == self.buffer_size - 1: - next_non_terminal = 1.0 - dones - next_value = last_value - last_return = self.rewards[step] + next_non_terminal * next_value - else: - next_non_terminal = 1.0 - self.dones[step + 1] - last_return = ( - self.rewards[step] - + self.gamma * last_return * next_non_terminal - ) - self.returns[step] = last_return - self.advantages = self.returns - self.values - def add( self, obs: np.ndarray, From dc3dc182615c7f8e5d9dbac2886de6a80e063279 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 9 Sep 2020 03:01:19 +0530 Subject: [PATCH 09/26] unified gae and normal adv --- genrl/core/rollout_storage.py | 67 ++++++++++++++--------------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index e1fa1a7c..c4bdf966 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -278,45 +278,30 @@ def compute_returns_and_advantage( last_value = last_value.flatten() if use_gae: - last_gae_lam = 0 - for step in reversed(range(rollout_buffer.buffer_size)): - if step == rollout_buffer.buffer_size - 1: - next_non_terminal = 1.0 - dones - next_value = last_value - else: - next_non_terminal = 1.0 - rollout_buffer.dones[step + 1] - next_value = rollout_buffer.values[step + 1] - delta = ( - rollout_buffer.rewards[step] - + rollout_buffer.gamma * next_value * next_non_terminal - - rollout_buffer.values[step] - ) - last_gae_lam = ( - delta - + rollout_buffer.gamma - * rollout_buffer.gae_lambda - * next_non_terminal - * last_gae_lam - ) - rollout_buffer.advantages[step] = last_gae_lam - rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values + gae_lambda = rollout_buffer.gae_lambda else: - # Discounted return with value bootstrap - # Note: this is equivalent to GAE computation - # with gae_lambda = 1.0 - last_return = 0.0 - for step in reversed(range(rollout_buffer.buffer_size)): - if step == rollout_buffer.buffer_size - 1: - next_non_terminal = 1.0 - dones - next_value = last_value - last_return = ( - rollout_buffer.rewards[step] + next_non_terminal * next_value - ) - else: - next_non_terminal = 1.0 - rollout_buffer.dones[step + 1] - last_return = ( - rollout_buffer.rewards[step] - + rollout_buffer.gamma * last_return * next_non_terminal - ) - rollout_buffer.returns[step] = last_return - rollout_buffer.advantages = rollout_buffer.returns - rollout_buffer.values + gae_lambda = 1 + + temp_dones = torch.cat( + [rollout_buffer.dones, 
torch.as_tensor(dones).unsqueeze(0)], dim=0 + ) + temp_values = torch.cat( + [rollout_buffer.values, torch.as_tensor(last_value).unsqueeze(0)], dim=0 + ) + + running_advantage = 0.0 + for step in reversed(range(rollout_buffer.buffer_size)): + next_non_terminal = 1 - temp_dones[step + 1] + next_values = temp_values[step + 1] + delta = ( + rollout_buffer.rewards[step] + + rollout_buffer.gamma * next_non_terminal * next_values + - temp_values[step] + ) + running_advantage = ( + delta + + rollout_buffer.gamma * gae_lambda * next_non_terminal * running_advantage + ) + rollout_buffer.advantages[step] = running_advantage + + rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values From 1a802fc96c1038056e6820a3f5521eab52a38a63 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 9 Sep 2020 16:35:29 +0530 Subject: [PATCH 10/26] Shift fn to OnPolicyAgent --- genrl/agents/deep/a2c/a2c.py | 5 ++-- genrl/agents/deep/base/onpolicy.py | 47 +++++++++++++++++++++++++++++ genrl/agents/deep/ppo1/ppo1.py | 5 +--- genrl/agents/deep/vpg/vpg.py | 5 ++-- genrl/core/__init__.py | 5 +--- genrl/core/rollout_storage.py | 48 ------------------------------ 6 files changed, 53 insertions(+), 62 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index ff6706a9..fb0d039a 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -6,7 +6,6 @@ from torch.nn import functional as F from genrl.agents.deep.base import OnPolicyAgent -from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -124,8 +123,8 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - compute_returns_and_advantage( - self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() + self.compute_returns_and_advantage( + values.detach().cpu().numpy(), dones.cpu().numpy() ) def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index 0c83cdfd..4db76bee 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -1,3 +1,4 @@ +import numpy as np import torch from genrl.agents.deep.base import BaseAgent @@ -94,3 +95,49 @@ def collect_rollouts(self, state: torch.Tensor): self.collect_rewards(dones, i) return values, dones + + def compute_returns_and_advantage( + self, + last_value: torch.Tensor, + dones: np.ndarray, + use_gae: bool = False, + ) -> None: + """ + Post-processing function: compute the returns (sum of discounted rewards) + and advantage (A(s) = R - V(S)). + Adapted from Stable-Baselines PPO2. + ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms + :param last_value: (torch.Tensor) + :param dones: (np.ndarray) + :param use_gae: (bool) Whether to use Generalized Advantage Estimation + or normal advantage for advantage computation. 
+ """ + last_value = last_value.flatten() + + if use_gae: + gae_lambda = self.rollout.gae_lambda + else: + gae_lambda = 1 + + next_values = last_value + next_non_terminal = 1 - dones + + running_advantage = 0.0 + for step in reversed(range(self.rollout.buffer_size)): + delta = ( + self.rollout.rewards[step] + + self.rollout.gamma * next_non_terminal * next_values + - self.rollout.values[step] + ) + running_advantage = ( + delta + + self.rollout.gamma + * gae_lambda + * next_non_terminal + * running_advantage + ) + next_non_terminal = 1 - self.rollout.dones[step] + next_values = self.rollout.values[step] + self.rollout.advantages[step] = running_advantage + + self.rollout.returns = self.rollout.advantages + self.rollout.values diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 00760363..a61e7229 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -6,7 +6,6 @@ import torch.optim as opt # noqa from genrl.agents import OnPolicyAgent -from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -138,9 +137,7 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - print(type(dones)) - compute_returns_and_advantage( - self.rollout, + self.compute_returns_and_advantage( values.detach().cpu().numpy(), dones.cpu().numpy(), use_gae=True, diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index d81025f7..39de3dc6 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -5,7 +5,6 @@ import torch.optim as opt from genrl.agents import OnPolicyAgent -from genrl.core import compute_returns_and_advantage from genrl.utils import get_env_properties, get_model, safe_mean @@ -112,8 +111,8 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - compute_returns_and_advantage( - self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() + self.compute_returns_and_advantage( + values.detach().cpu().numpy(), dones.cpu().numpy() ) def update_params(self) -> None: diff --git a/genrl/core/__init__.py b/genrl/core/__init__.py index 0e9e731e..96824ac1 100644 --- a/genrl/core/__init__.py +++ b/genrl/core/__init__.py @@ -15,10 +15,7 @@ MlpPolicy, get_policy_from_name, ) -from genrl.core.rollout_storage import ( # noqa - RolloutBuffer, - compute_returns_and_advantage, -) +from genrl.core.rollout_storage import RolloutBuffer # noqa from genrl.core.values import ( # noqa BaseValue, CnnCategoricalValue, diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index c4bdf966..48679be4 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -257,51 +257,3 @@ def _get_samples(self, batch_inds: np.ndarray) -> RolloutBufferSamples: self.returns[batch_inds].flatten(), ) return RolloutBufferSamples(*tuple(map(self.to_torch, data))) - - -def compute_returns_and_advantage( - rollout_buffer: Union[RolloutBuffer, BaseBuffer], - last_value: torch.Tensor, - dones: np.ndarray, - use_gae: bool = False, -) -> None: - """ - Post-processing function: compute the returns (sum of discounted rewards) - and advantage (A(s) = R - V(S)). - Adapted from Stable-Baselines PPO2. 
- ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms - :param last_value: (torch.Tensor) - :param dones: (np.ndarray) - :param use_gae: (bool) Whether to use Generalized Advantage Estimation - or normal advantage for advantage computation. - """ - last_value = last_value.flatten() - - if use_gae: - gae_lambda = rollout_buffer.gae_lambda - else: - gae_lambda = 1 - - temp_dones = torch.cat( - [rollout_buffer.dones, torch.as_tensor(dones).unsqueeze(0)], dim=0 - ) - temp_values = torch.cat( - [rollout_buffer.values, torch.as_tensor(last_value).unsqueeze(0)], dim=0 - ) - - running_advantage = 0.0 - for step in reversed(range(rollout_buffer.buffer_size)): - next_non_terminal = 1 - temp_dones[step + 1] - next_values = temp_values[step + 1] - delta = ( - rollout_buffer.rewards[step] - + rollout_buffer.gamma * next_non_terminal * next_values - - temp_values[step] - ) - running_advantage = ( - delta - + rollout_buffer.gamma * gae_lambda * next_non_terminal * running_advantage - ) - rollout_buffer.advantages[step] = running_advantage - - rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values From 6609952c68482c3d837a20fd828a2b8f282f557a Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 9 Sep 2020 22:19:46 +0530 Subject: [PATCH 11/26] Remove redundant line --- genrl/agents/deep/base/onpolicy.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index 4db76bee..21ed8e5c 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -130,11 +130,7 @@ def compute_returns_and_advantage( - self.rollout.values[step] ) running_advantage = ( - delta - + self.rollout.gamma - * gae_lambda - * next_non_terminal - * running_advantage + delta + self.rollout.gamma * gae_lambda * running_advantage ) next_non_terminal = 1 - self.rollout.dones[step] next_values = self.rollout.values[step] From 6453ee010a6c986fef0bb3d37aeafdb58dd2e6f4 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Wed, 9 Sep 2020 23:32:28 +0530 Subject: [PATCH 12/26] New file distributed.py --- genrl/agents/deep/a2c/a2c.py | 11 +++++-- genrl/agents/deep/base/onpolicy.py | 42 -------------------------- genrl/agents/deep/ppo1/ppo1.py | 10 +++++-- genrl/agents/deep/vpg/vpg.py | 11 +++++-- genrl/utils/__init__.py | 1 + genrl/utils/discount.py | 47 ++++++++++++++++++++++++++++++ 6 files changed, 72 insertions(+), 50 deletions(-) create mode 100644 genrl/utils/discount.py diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index fb0d039a..b3ae27ca 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -6,7 +6,12 @@ from torch.nn import functional as F from genrl.agents.deep.base import OnPolicyAgent -from genrl.utils import get_env_properties, get_model, safe_mean +from genrl.utils import ( + compute_returns_and_advantage, + get_env_properties, + get_model, + safe_mean, +) class A2C(OnPolicyAgent): @@ -123,8 +128,8 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.compute_returns_and_advantage( - values.detach().cpu().numpy(), dones.cpu().numpy() + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): 
diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index 21ed8e5c..ee227dad 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -95,45 +95,3 @@ def collect_rollouts(self, state: torch.Tensor): self.collect_rewards(dones, i) return values, dones - - def compute_returns_and_advantage( - self, - last_value: torch.Tensor, - dones: np.ndarray, - use_gae: bool = False, - ) -> None: - """ - Post-processing function: compute the returns (sum of discounted rewards) - and advantage (A(s) = R - V(S)). - Adapted from Stable-Baselines PPO2. - ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms - :param last_value: (torch.Tensor) - :param dones: (np.ndarray) - :param use_gae: (bool) Whether to use Generalized Advantage Estimation - or normal advantage for advantage computation. - """ - last_value = last_value.flatten() - - if use_gae: - gae_lambda = self.rollout.gae_lambda - else: - gae_lambda = 1 - - next_values = last_value - next_non_terminal = 1 - dones - - running_advantage = 0.0 - for step in reversed(range(self.rollout.buffer_size)): - delta = ( - self.rollout.rewards[step] - + self.rollout.gamma * next_non_terminal * next_values - - self.rollout.values[step] - ) - running_advantage = ( - delta + self.rollout.gamma * gae_lambda * running_advantage - ) - next_non_terminal = 1 - self.rollout.dones[step] - next_values = self.rollout.values[step] - self.rollout.advantages[step] = running_advantage - - self.rollout.returns = self.rollout.advantages + self.rollout.values diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index a61e7229..fed0547e 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -6,7 +6,12 @@ import torch.optim as opt # noqa from genrl.agents import OnPolicyAgent -from genrl.utils import get_env_properties, get_model, safe_mean +from genrl.utils import ( + compute_returns_and_advantage, + get_env_properties, + get_model, + safe_mean, +) class PPO1(OnPolicyAgent): @@ -137,7 +142,8 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.compute_returns_and_advantage( + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy(), use_gae=True, diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index 39de3dc6..63af618d 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -5,7 +5,12 @@ import torch.optim as opt from genrl.agents import OnPolicyAgent -from genrl.utils import get_env_properties, get_model, safe_mean +from genrl.utils import ( + compute_returns_and_advantage, + get_env_properties, + get_model, + safe_mean, +) class VPG(OnPolicyAgent): @@ -111,8 +116,8 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.compute_returns_and_advantage( - values.detach().cpu().numpy(), dones.cpu().numpy() + compute_returns_and_advantage( + self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) def update_params(self) -> None: diff --git a/genrl/utils/__init__.py b/genrl/utils/__init__.py index fdf27737..b7f4070d 100644 --- a/genrl/utils/__init__.py +++ b/genrl/utils/__init__.py @@ -7,6 +7,7 @@ MushroomDataBandit, 
StatlogDataBandit, ) +from genrl.utils.discount import compute_returns_and_advantage # noqa from genrl.utils.logger import CSVLogger # noqa from genrl.utils.logger import HumanOutputFormat # noqa from genrl.utils.logger import Logger # noqa diff --git a/genrl/utils/discount.py b/genrl/utils/discount.py new file mode 100644 index 00000000..e934a890 --- /dev/null +++ b/genrl/utils/discount.py @@ -0,0 +1,47 @@ +from typing import Union + +import numpy as np +import torch + + +def compute_returns_and_advantage( + rollout_buffer, + last_value: torch.Tensor, + dones: np.ndarray, + use_gae: bool = False, +) -> None: + """ + Post-processing function: compute the returns (sum of discounted rewards) + and advantage (A(s) = R - V(S)). + Adapted from Stable-Baselines PPO2. + ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms + :param last_value: (torch.Tensor) + :param dones: (np.ndarray) + :param use_gae: (bool) Whether to use Generalized Advantage Estimation + or normal advantage for advantage computation. + """ + last_value = last_value.flatten() + + if use_gae: + gae_lambda = rollout_buffer.gae_lambda + else: + gae_lambda = 1 + + next_values = last_value + next_non_terminal = 1 - dones + + running_advantage = 0.0 + for step in reversed(range(rollout_buffer.buffer_size)): + delta = ( + rollout_buffer.rewards[step] + + rollout_buffer.gamma * next_non_terminal * next_values + - rollout_buffer.values[step] + ) + running_advantage = ( + delta + rollout_buffer.gamma * gae_lambda * running_advantage + ) + next_non_terminal = 1 - rollout_buffer.dones[step] + next_values = rollout_buffer.values[step] + rollout_buffer.advantages[step] = running_advantage + + rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values From 3160e73233225b387c12a171e4f1b4b2cba49a7e Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Thu, 10 Sep 2020 23:05:59 +0530 Subject: [PATCH 13/26] Fix LGTM --- genrl/agents/deep/base/onpolicy.py | 1 - genrl/utils/discount.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index ee227dad..0c83cdfd 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -1,4 +1,3 @@ -import numpy as np import torch from genrl.agents.deep.base import BaseAgent diff --git a/genrl/utils/discount.py b/genrl/utils/discount.py index e934a890..2de2e521 100644 --- a/genrl/utils/discount.py +++ b/genrl/utils/discount.py @@ -1,11 +1,11 @@ -from typing import Union +from typing import Any import numpy as np import torch def compute_returns_and_advantage( - rollout_buffer, + rollout_buffer: Any, last_value: torch.Tensor, dones: np.ndarray, use_gae: bool = False, From 6d8ed2dfee3a4ff9f3a13958e198ec9c087dd4ff Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Fri, 11 Sep 2020 20:17:17 +0530 Subject: [PATCH 14/26] Docstring --- genrl/utils/discount.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/genrl/utils/discount.py b/genrl/utils/discount.py index 2de2e521..54d8754d 100644 --- a/genrl/utils/discount.py +++ b/genrl/utils/discount.py @@ -14,11 +14,15 @@ def compute_returns_and_advantage( Post-processing function: compute the returns (sum of discounted rewards) and advantage (A(s) = R - V(S)). Adapted from Stable-Baselines PPO2. 
- ;param rollout_buffer: (RolloutBuffer, BaseBuffer) An instance of the rollout buffer used in On-policy algorithms - :param last_value: (torch.Tensor) - :param dones: (np.ndarray) - :param use_gae: (bool) Whether to use Generalized Advantage Estimation - or normal advantage for advantage computation. + + Args: + rollout_buffer: An instance of the rollout buffer used for OnPolicy Agents + last_value: (:obj: torch.tensor) + dones: (:obj: np.ndarray) + use_gae: (bool) True if Generalized Advantage Estimation is to be used, else False + + Returns: + A modified Rollout Buffer with advantages calculated """ last_value = last_value.flatten() From bf71710a386b4e76ce2d26e83531f68daa5cbf55 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sat, 12 Sep 2020 13:00:52 +0530 Subject: [PATCH 15/26] Adding tutorial --- .isort.cfg | 2 +- .pre-commit-config.yaml | 2 +- ...ared parameters in actor critic agents.rst | 70 +++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst diff --git a/.isort.cfg b/.isort.cfg index db6b8351..4b0feff5 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,5 +1,5 @@ [settings] -known_third_party = cv2,gym,matplotlib,numpy,pandas,pytest,scipy,setuptools,torch +known_third_party = cv2,gym,matplotlib,numpy,pandas,pytest,scipy,setuptools,toml,torch multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c68e57a..990fb2a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: args: [--exclude=^((examples|docs)/.*)$] - repo: https://github.com/timothycrosley/isort - rev: 4.3.2 + rev: 5.4.2 hooks: - id: isort diff --git a/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst new file mode 100644 index 00000000..70a7b764 --- /dev/null +++ b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst @@ -0,0 +1,70 @@ +Using Shared Parameters in Actor Critic Agents in GenRL +======================================================= + +The Actor Critic Agents use two networks, an Actor network to select an action to be taken in the current state, and a +critic network, to estimate the value of the state the agent is currently in. There are two common ways to implement +this actor critic architecture. + +The first method - Indpendent Actor and critic networks - + +.. code-block:: none + + state + / \ + + / \ + action value + +And the second method - Using a set of shared parameters to extract a feature vector from the state. The actor and the +critic network act on this feature vector to select an action and estimate the value + +.. code-block:: none + + state + | + + / \ + + / \ + action value + +GenRL provides support to incorporte this decoder network in all of the actor critic agents through a ``shared_layers`` +parameter. ``shared_layers`` takes the sizes of the mlp layers o be used, and ``None`` if no decoder network is to be +used + +As an example - in A2C - +.. 
code-block:: python +# The imports +from genrl.agents import A2C +from genrl.environments import VectorEnv +from genrl.trainers import OnPolicyTrainer + +# Initializing the environment +env = VectorEnv("CartPole-v0", 1) + +# Initializing the agent to be used +algo = A2C( + "mlp", + env, + policy_layers=(128,), + value_layers=(128,), + shared_layers=(32, 64), + rollout_size=128, + ) + +# Finally initializing the trainer and trainer +trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) +trainer.train() + +The above example uses and mlp of layer sizes (32, 64) as the decoder, and can be visualised as follows - +.. code-block:: none + + state + | + <32> + | + <64> + / \ + <128> <128> + / \ + action value \ No newline at end of file From fc356b9aaa660f72e5566fc3e76cec29076920d8 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sat, 12 Sep 2020 13:03:10 +0530 Subject: [PATCH 16/26] Small change --- .../Using shared parameters in actor critic agents.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst index 70a7b764..82eb12b7 100644 --- a/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst +++ b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst @@ -33,6 +33,7 @@ parameter. ``shared_layers`` takes the sizes of the mlp layers o be used, and `` used As an example - in A2C - + .. code-block:: python # The imports from genrl.agents import A2C @@ -57,6 +58,7 @@ trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1 trainer.train() The above example uses and mlp of layer sizes (32, 64) as the decoder, and can be visualised as follows - + .. 
code-block:: none state From 844c53da24fc4d65aa438cb8538f2bfec5498046 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sun, 13 Sep 2020 11:52:26 +0530 Subject: [PATCH 17/26] Index --- docs/source/usage/tutorials/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/usage/tutorials/index.rst b/docs/source/usage/tutorials/index.rst index f0f4cb63..85875f44 100644 --- a/docs/source/usage/tutorials/index.rst +++ b/docs/source/usage/tutorials/index.rst @@ -9,5 +9,6 @@ Tutorials Deep/index Using Custom Policies Using A2C + Using shared parameters in actor critic agents using_vpg Saving and loading From d15b9db1def5476a9703f7d6182e7cf9b39184f4 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Sun, 11 Oct 2020 00:46:15 +0530 Subject: [PATCH 18/26] Adding return statement --- genrl/agents/deep/a2c/a2c.py | 2 +- genrl/agents/deep/vpg/vpg.py | 2 +- genrl/utils/discount.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 633b005c..316e1511 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -128,7 +128,7 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - compute_returns_and_advantage( + self.rollout = compute_returns_and_advantage( self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index 2d6a6ef9..812eef02 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -116,7 +116,7 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - compute_returns_and_advantage( + self.rollout = compute_returns_and_advantage( self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) diff --git a/genrl/utils/discount.py b/genrl/utils/discount.py index 54d8754d..ec728de8 100644 --- a/genrl/utils/discount.py +++ b/genrl/utils/discount.py @@ -49,3 +49,5 @@ def compute_returns_and_advantage( rollout_buffer.advantages[step] = running_advantage rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values + + return rollout_buffer From ca28fa4a1fb8b0701199a72ae8dd13e3fc4bb174 Mon Sep 17 00:00:00 2001 From: Rishabh Patra Date: Tue, 13 Oct 2020 01:28:34 +0530 Subject: [PATCH 19/26] Fix discount.py --- genrl/agents/deep/a2c/a2c.py | 2 +- genrl/agents/deep/vpg/vpg.py | 2 +- genrl/utils/discount.py | 15 ++++++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 316e1511..e2b49c57 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -128,7 +128,7 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.rollout = compute_returns_and_advantage( + self.rollout.returns, self.rollout.advantages = compute_returns_and_advantage( self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index 812eef02..3b34d9d3 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -116,7 
+116,7 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - self.rollout = compute_returns_and_advantage( + self.rollout.returns, self.rollout.advantages = compute_returns_and_advantage( self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() ) diff --git a/genrl/utils/discount.py b/genrl/utils/discount.py index ec728de8..aeec56ba 100644 --- a/genrl/utils/discount.py +++ b/genrl/utils/discount.py @@ -29,25 +29,26 @@ def compute_returns_and_advantage( if use_gae: gae_lambda = rollout_buffer.gae_lambda else: - gae_lambda = 1 + gae_lambda = 1.0 - next_values = last_value - next_non_terminal = 1 - dones + next_value = last_value + next_non_terminal = 1.0 - dones running_advantage = 0.0 for step in reversed(range(rollout_buffer.buffer_size)): delta = ( rollout_buffer.rewards[step] - + rollout_buffer.gamma * next_non_terminal * next_values + + rollout_buffer.gamma * next_value * next_non_terminal - rollout_buffer.values[step] ) running_advantage = ( - delta + rollout_buffer.gamma * gae_lambda * running_advantage + delta + + rollout_buffer.gamma * gae_lambda * next_non_terminal * running_advantage ) next_non_terminal = 1 - rollout_buffer.dones[step] - next_values = rollout_buffer.values[step] + next_value = rollout_buffer.values[step] rollout_buffer.advantages[step] = running_advantage rollout_buffer.returns = rollout_buffer.advantages + rollout_buffer.values - return rollout_buffer + return rollout_buffer.returns, rollout_buffer.advantages From 29b6134c51bb007e90a4d0399353b8bc7b3054a6 Mon Sep 17 00:00:00 2001 From: hades-rp2010 Date: Tue, 24 Nov 2020 01:56:40 +0530 Subject: [PATCH 20/26] CUDA support for onp agents --- ...ared parameters in actor critic agents.rst | 28 -------- genrl/agents/deep/base/onpolicy.py | 2 +- tests/test_deep/test_agents/test_a2c.py | 29 -------- tests/test_deep/test_agents/test_custom.py | 72 ------------------- tests/test_deep/test_agents/test_ddpg.py | 57 --------------- tests/test_deep/test_agents/test_ppo1.py | 29 -------- tests/test_deep/test_agents/test_sac.py | 48 ------------- tests/test_deep/test_agents/test_td3.py | 57 --------------- 8 files changed, 1 insertion(+), 321 deletions(-) delete mode 100644 tests/test_deep/test_agents/test_a2c.py delete mode 100644 tests/test_deep/test_agents/test_custom.py delete mode 100644 tests/test_deep/test_agents/test_ddpg.py delete mode 100644 tests/test_deep/test_agents/test_ppo1.py delete mode 100644 tests/test_deep/test_agents/test_sac.py delete mode 100644 tests/test_deep/test_agents/test_td3.py diff --git a/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst index 0cecb2ec..bec17a41 100644 --- a/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst +++ b/docs/source/usage/tutorials/Using shared parameters in actor critic agents.rst @@ -29,39 +29,12 @@ critic network act on this feature vector to select an action and estimate the v action value GenRL provides support to incorporte this decoder network in all of the actor critic agents through a ``shared_layers`` -<<<<<<< HEAD -parameter. ``shared_layers`` takes the sizes of the mlp layers o be used, and ``None`` if no decoder network is to be -======= parameter. 
``shared_layers`` takes the sizes of the mlp layers to be used, and ``None`` if no decoder network is to be ->>>>>>> 25eb018f18a9a1d0865c16e5233a2a7ccddbfd78 used As an example - in A2C - .. code-block:: python -<<<<<<< HEAD -# The imports -from genrl.agents import A2C -from genrl.environments import VectorEnv -from genrl.trainers import OnPolicyTrainer - -# Initializing the environment -env = VectorEnv("CartPole-v0", 1) - -# Initializing the agent to be used -algo = A2C( - "mlp", - env, - policy_layers=(128,), - value_layers=(128,), - shared_layers=(32, 64), - rollout_size=128, - ) - -# Finally initializing the trainer and trainer -trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) -trainer.train() -======= # The imports from genrl.agents import A2C @@ -84,7 +57,6 @@ trainer.train() # Finally initializing the trainer and trainer trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) trainer.train() ->>>>>>> 25eb018f18a9a1d0865c16e5233a2a7ccddbfd78 The above example uses and mlp of layer sizes (32, 64) as the decoder, and can be visualised as follows - diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index 0c83cdfd..d815e22b 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -73,7 +73,7 @@ def collect_rollouts(self, state: torch.Tensor): dones (:obj:`torch.Tensor`): Game over statuses of each environment """ for i in range(self.rollout_size): - action, values, old_log_probs = self.select_action(state) + action, values, old_log_probs = self.select_action(state.to(self.device)) next_state, reward, dones, _ = self.env.step(action) diff --git a/tests/test_deep/test_agents/test_a2c.py b/tests/test_deep/test_agents/test_a2c.py deleted file mode 100644 index f731f40f..00000000 --- a/tests/test_deep/test_agents/test_a2c.py +++ /dev/null @@ -1,29 +0,0 @@ -import shutil - -from genrl.agents import A2C -from genrl.environments import VectorEnv -from genrl.trainers import OnPolicyTrainer - - -def test_a2c(): - env = VectorEnv("CartPole-v0", 1) - algo = A2C("mlp", env, rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") - - -def test_a2c_cnn(): - env = VectorEnv("Pong-v0", 1, env_type="atari") - algo = A2C("cnn", env, rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") - - -def test_a2c_shared(): - env = VectorEnv("CartPole-v0", 1) - algo = A2C("mlp", env, shared_layers=(32, 32), rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_custom.py b/tests/test_deep/test_agents/test_custom.py deleted file mode 100644 index a0c97063..00000000 --- a/tests/test_deep/test_agents/test_custom.py +++ /dev/null @@ -1,72 +0,0 @@ -import shutil - -from genrl.agents import PPO1, TD3, VPG -from genrl.core import ( - MlpActorCritic, - MlpPolicy, - MlpValue, - NormalActionNoise, - OrnsteinUhlenbeckActionNoise, -) -from genrl.environments import VectorEnv -from genrl.trainers import OffPolicyTrainer, OnPolicyTrainer - - -class custom_policy(MlpPolicy): - def __init__(self, state_dim, action_dim, policy_layers=(1, 1), **kwargs): - super(custom_policy, self).__init__( - state_dim, action_dim, policy_layers=policy_layers, **kwargs - ) - - -class 
custom_actorcritic(MlpActorCritic): - def __init__( - self, - state_dim, - action_dim, - shared_layers=None, - policy_layers=(1, 1), - value_layers=(1, 1), - val_type="V", - **kwargs - ): - super(custom_actorcritic, self).__init__( - state_dim, - action_dim, - shared_layers=shared_layers, - policy_layers=policy_layers, - value_layers=value_layers, - val_type=val_type, - **kwargs - ) - - def get_params(self): - actor_params = self.actor.parameters() - critic_params = self.critic.parameters() - return actor_params, critic_params - - -def test_custom_vpg(): - env = VectorEnv("CartPole-v0", 1) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - policy = custom_policy(state_dim, action_dim) - - algo = VPG(policy, env) - - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") - - -def test_custom_ppo1(): - env = VectorEnv("CartPole-v0", 1) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - actorcritic = custom_actorcritic(state_dim, action_dim) - - algo = PPO1(actorcritic, env) - - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ddpg.py b/tests/test_deep/test_agents/test_ddpg.py deleted file mode 100644 index 94670cef..00000000 --- a/tests/test_deep/test_agents/test_ddpg.py +++ /dev/null @@ -1,57 +0,0 @@ -import shutil - -from genrl.agents import DDPG -from genrl.core import NormalActionNoise -from genrl.environments import VectorEnv -from genrl.trainers import OffPolicyTrainer - - -def test_ddpg(): - env = VectorEnv("Pendulum-v0", 2) - algo = DDPG( - "mlp", - env, - batch_size=5, - noise=NormalActionNoise, - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=4, - max_ep_len=200, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") - - -def test_ddpg_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = DDPG( - "mlp", - env, - batch_size=5, - noise=NormalActionNoise, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=4, - max_ep_len=200, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_ppo1.py b/tests/test_deep/test_agents/test_ppo1.py deleted file mode 100644 index 97d40791..00000000 --- a/tests/test_deep/test_agents/test_ppo1.py +++ /dev/null @@ -1,29 +0,0 @@ -import shutil - -from genrl.agents import PPO1 -from genrl.environments import VectorEnv -from genrl.trainers import OnPolicyTrainer - - -def test_ppo1(): - env = VectorEnv("CartPole-v0") - algo = PPO1("mlp", env, rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") - - -def test_ppo1_cnn(): - env = VectorEnv("Pong-v0", env_type="atari") - algo = PPO1("cnn", env, rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") - - -def test_ppo1_shared(): - env = VectorEnv("CartPole-v0") - algo = PPO1("mlp", env, shared_layers=[32, 32], rollout_size=128) - trainer = OnPolicyTrainer(algo, env, log_mode=["csv"], logdir="./logs", epochs=1) - trainer.train() - shutil.rmtree("./logs") diff --git 
a/tests/test_deep/test_agents/test_sac.py b/tests/test_deep/test_agents/test_sac.py deleted file mode 100644 index 3ea1cfee..00000000 --- a/tests/test_deep/test_agents/test_sac.py +++ /dev/null @@ -1,48 +0,0 @@ -import shutil - -from genrl.agents import SAC -from genrl.environments import VectorEnv -from genrl.trainers import OffPolicyTrainer - - -def test_sac(): - env = VectorEnv("Pendulum-v0", 2) - algo = SAC("mlp", env, batch_size=5, policy_layers=[1, 1], value_layers=[1, 1]) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") - - -def test_sac_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = SAC( - "mlp", - env, - batch_size=5, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") diff --git a/tests/test_deep/test_agents/test_td3.py b/tests/test_deep/test_agents/test_td3.py deleted file mode 100644 index 35def46f..00000000 --- a/tests/test_deep/test_agents/test_td3.py +++ /dev/null @@ -1,57 +0,0 @@ -import shutil - -from genrl.agents import TD3 -from genrl.core import OrnsteinUhlenbeckActionNoise -from genrl.environments import VectorEnv -from genrl.trainers import OffPolicyTrainer - - -def test_td3(): - env = VectorEnv("Pendulum-v0", 2) - algo = TD3( - "mlp", - env, - batch_size=5, - noise=OrnsteinUhlenbeckActionNoise, - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") - - -def test_td3_shared(): - env = VectorEnv("Pendulum-v0", 2) - algo = TD3( - "mlp", - env, - batch_size=5, - noise=OrnsteinUhlenbeckActionNoise, - shared_layers=[1, 1], - policy_layers=[1, 1], - value_layers=[1, 1], - ) - - trainer = OffPolicyTrainer( - algo, - env, - log_mode=["csv"], - logdir="./logs", - epochs=5, - max_ep_len=500, - warmup_steps=10, - start_update=10, - ) - trainer.train() - shutil.rmtree("./logs") From 5b915dd35a202f08b43814a30e09523f4576242b Mon Sep 17 00:00:00 2001 From: hades-rp2010 Date: Wed, 25 Nov 2020 15:16:43 +0530 Subject: [PATCH 21/26] All agents except DQN --- .pre-commit-config.yaml | 2 +- genrl/agents/deep/a2c/a2c.py | 8 ++--- genrl/agents/deep/base/offpolicy.py | 23 ++++++++++--- genrl/agents/deep/base/onpolicy.py | 2 +- genrl/agents/deep/ppo1/ppo1.py | 12 +++---- genrl/agents/deep/sac/sac.py | 20 +++++++---- genrl/agents/deep/td3/td3.py | 4 +-- genrl/agents/deep/vpg/vpg.py | 8 ++--- genrl/core/buffers.py | 5 +-- genrl/core/rollout_storage.py | 42 ++++++++++++++--------- genrl/trainers/offpolicy.py | 4 ++- tests/test_agents/test_bandit/__init__.py | 4 +-- 12 files changed, 83 insertions(+), 51 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1e772ecc..0fcfb45d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3.7 + language_version: python3.8 - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.3 diff --git a/genrl/agents/deep/a2c/a2c.py b/genrl/agents/deep/a2c/a2c.py index 1bfa3a88..df258d00 100644 --- a/genrl/agents/deep/a2c/a2c.py +++ b/genrl/agents/deep/a2c/a2c.py @@ -118,7 +118,7 @@ 
def select_action( action, dist = self.ac.get_action(state, deterministic=deterministic) value = self.ac.get_value(state) - return action.detach(), value, dist.log_prob(action).cpu() + return action.detach(), value, dist.log_prob(action) def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: """Get loss from trajectory traversed by agent during rollouts @@ -130,7 +130,7 @@ def get_traj_loss(self, values: torch.Tensor, dones: torch.Tensor) -> None: dones (:obj:`list` of bool): Game over statuses of each environment """ self.rollout.returns, self.rollout.advantages = compute_returns_and_advantage( - self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() + self.rollout, values.detach(), dones.to(self.device) ) def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): @@ -150,7 +150,7 @@ def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): states, actions = states.to(self.device), actions.to(self.device) _, dist = self.ac.get_action(states, deterministic=False) values = self.ac.get_value(states) - return values, dist.log_prob(actions).cpu(), dist.entropy().cpu() + return values, dist.log_prob(actions), dist.entropy() def update_params(self) -> None: """Updates the the A2C network @@ -171,7 +171,7 @@ def update_params(self) -> None: policy_loss = -torch.mean(policy_loss) self.logs["policy_loss"].append(policy_loss.item()) - value_loss = self.value_coeff * F.mse_loss(rollout.returns, values.cpu()) + value_loss = self.value_coeff * F.mse_loss(rollout.returns, values) self.logs["value_loss"].append(torch.mean(value_loss).item()) entropy_loss = -torch.mean(entropy) # Change this to entropy diff --git a/genrl/agents/deep/base/offpolicy.py b/genrl/agents/deep/base/offpolicy.py index fa632e8a..0a473a07 100644 --- a/genrl/agents/deep/base/offpolicy.py +++ b/genrl/agents/deep/base/offpolicy.py @@ -47,6 +47,7 @@ def __init__( self.replay_buffer = PrioritizedBuffer(self.replay_size) else: raise NotImplementedError + # self.replay_buffer = self.replay_buffer.to(self.device) def update_params_before_select_action(self, timestep: int) -> None: """Update any parameters before selecting action like epsilon for decaying epsilon greedy @@ -107,6 +108,7 @@ def sample_from_buffer(self, beta: float = None): ) else: raise NotImplementedError + # print(batch.device) return batch def get_q_loss(self, batch: collections.namedtuple) -> torch.Tensor: @@ -118,9 +120,13 @@ def get_q_loss(self, batch: collections.namedtuple) -> torch.Tensor: Returns: loss (:obj:`torch.Tensor`): Calculated loss of the Q-function """ - q_values = self.get_q_values(batch.states, batch.actions) + q_values = self.get_q_values( + batch.states.to(self.device), batch.actions.to(self.device) + ) target_q_values = self.get_target_q_values( - batch.next_states, batch.rewards, batch.dones + batch.next_states.to(self.device), + batch.rewards.to(self.device), + batch.dones.to(self.device), ) loss = F.mse_loss(q_values, target_q_values) return loss @@ -167,15 +173,16 @@ def select_action( Returns: action (:obj:`torch.Tensor`): Action taken by the agent """ + state = state.to(self.device) action, _ = self.ac.get_action(state, deterministic) action = action.detach() # add noise to output from policy network if self.noise is not None: - action += self.noise() + action += self.noise().to(self.device) return torch.clamp( - action, self.env.action_space.low[0], self.env.action_space.high[0] + action.cpu(), self.env.action_space.low[0], self.env.action_space.high[0] ) def update_target_model(self) 
-> None: @@ -199,6 +206,7 @@ def get_q_values(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Ten Returns: q_values (:obj:`torch.Tensor`): Q values for the given states and actions """ + states, actions = states.to(self.device), actions.to(self.device) if self.doublecritic: q_values = self.ac.get_value( torch.cat([states, actions], dim=-1), mode="both" @@ -221,6 +229,7 @@ def get_target_q_values( Returns: target_q_values (:obj:`torch.Tensor`): Target Q values for the TD3 """ + next_states = next_states.to(self.device) next_target_actions = self.ac_target.get_action(next_states, True)[0] if self.doublecritic: @@ -231,7 +240,10 @@ def get_target_q_values( next_q_target_values = self.ac_target.get_value( torch.cat([next_states, next_target_actions], dim=-1) ) - target_q_values = rewards + self.gamma * (1 - dones) * next_q_target_values + target_q_values = ( + rewards.to(self.device) + + self.gamma * (1 - dones.to(self.device)) * next_q_target_values + ) return target_q_values @@ -265,6 +277,7 @@ def get_p_loss(self, states: torch.Tensor) -> torch.Tensor: Returns: loss (:obj:`torch.Tensor`): Calculated policy loss """ + states = states.to(self.device) next_best_actions = self.ac.get_action(states, True)[0] q_values = self.ac.get_value(torch.cat([states, next_best_actions], dim=-1)) policy_loss = -torch.mean(q_values) diff --git a/genrl/agents/deep/base/onpolicy.py b/genrl/agents/deep/base/onpolicy.py index d815e22b..2eccdbec 100644 --- a/genrl/agents/deep/base/onpolicy.py +++ b/genrl/agents/deep/base/onpolicy.py @@ -36,7 +36,7 @@ def __init__( if buffer_type == "rollout": self.rollout = RolloutBuffer( - self.rollout_size, self.env, gae_lambda=gae_lambda + self.rollout_size, self.env, gae_lambda=gae_lambda, device=self.device ) else: raise NotImplementedError diff --git a/genrl/agents/deep/ppo1/ppo1.py b/genrl/agents/deep/ppo1/ppo1.py index 9d573069..4d721321 100644 --- a/genrl/agents/deep/ppo1/ppo1.py +++ b/genrl/agents/deep/ppo1/ppo1.py @@ -113,7 +113,7 @@ def select_action( action, dist = self.ac.get_action(state, deterministic=deterministic) value = self.ac.get_value(state) - return action.detach(), value, dist.log_prob(action).cpu() + return action.detach(), value, dist.log_prob(action) def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): """Evaluates actions taken by actor @@ -132,7 +132,7 @@ def evaluate_actions(self, states: torch.Tensor, actions: torch.Tensor): states, actions = states.to(self.device), actions.to(self.device) _, dist = self.ac.get_action(states, deterministic=False) values = self.ac.get_value(states) - return values, dist.log_prob(actions).cpu(), dist.entropy().cpu() + return values, dist.log_prob(actions), dist.entropy() def get_traj_loss(self, values, dones): """Get loss from trajectory traversed by agent during rollouts @@ -143,10 +143,10 @@ def get_traj_loss(self, values, dones): values (:obj:`torch.Tensor`): Values of states encountered during the rollout dones (:obj:`list` of bool): Game over statuses of each environment """ - compute_returns_and_advantage( + self.rollout.returns, self.rollout.advantages = compute_returns_and_advantage( self.rollout, - values.detach().cpu().numpy(), - dones.cpu().numpy(), + values.detach(), + dones.to(self.device), use_gae=True, ) @@ -180,7 +180,7 @@ def update_params(self): values = values.flatten() value_loss = self.value_coeff * nn.functional.mse_loss( - rollout.returns, values.cpu() + rollout.returns, values ) self.logs["value_loss"].append(torch.mean(value_loss).item()) diff --git 
a/genrl/agents/deep/sac/sac.py b/genrl/agents/deep/sac/sac.py index ff93eeaa..5dc4253f 100644 --- a/genrl/agents/deep/sac/sac.py +++ b/genrl/agents/deep/sac/sac.py @@ -67,10 +67,10 @@ def _create_model(self, **kwargs) -> None: else: self.action_scale = torch.FloatTensor( (self.env.action_space.high - self.env.action_space.low) / 2.0 - ) + ).to(self.device) self.action_bias = torch.FloatTensor( (self.env.action_space.high + self.env.action_space.low) / 2.0 - ) + ).to(self.device) if isinstance(self.network, str): state_dim, action_dim, discrete, _ = get_env_properties( @@ -89,7 +89,7 @@ def _create_model(self, **kwargs) -> None: sac=True, action_scale=self.action_scale, action_bias=self.action_bias, - ) + ).to(self.device) else: self.model = self.network @@ -102,7 +102,7 @@ def _create_model(self, **kwargs) -> None: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape) ).item() - self.log_alpha = torch.zeros(1, requires_grad=True) + self.log_alpha = torch.zeros(1, device=self.device, requires_grad=True) self.optimizer_alpha = opt.Adam([self.log_alpha], lr=self.lr_policy) def select_action( @@ -119,8 +119,9 @@ def select_action( Returns: action (:obj:`np.ndarray`): Action taken by the agent """ - action, _, _ = self.ac.get_action(state, deterministic) - return action.detach() + state = state.to(self.device) + action, _, _ = self.ac.get_action(state.to(self.device), deterministic) + return action.detach().cpu() def update_target_model(self) -> None: """Function to update the target Q model @@ -147,11 +148,15 @@ def get_target_q_values( Returns: target_q_values (:obj:`torch.Tensor`): Target Q values for the SAC """ + next_states = next_states.to(self.device) next_target_actions, next_log_probs, _ = self.ac.get_action(next_states) next_q_target_values = self.ac_target.get_value( torch.cat([next_states, next_target_actions], dim=-1), mode="min" ).squeeze() - self.alpha * next_log_probs.squeeze(1) - target_q_values = rewards + self.gamma * (1 - dones) * next_q_target_values + target_q_values = ( + rewards.to(self.device) + + self.gamma * (1 - dones.to(self.device)) * next_q_target_values + ) return target_q_values def get_p_loss(self, states: torch.Tensor) -> torch.Tensor: @@ -163,6 +168,7 @@ def get_p_loss(self, states: torch.Tensor) -> torch.Tensor: Returns: loss (:obj:`torch.Tensor`): Calculated policy loss """ + states = states.to(self.device) next_best_actions, log_probs, _ = self.ac.get_action(states) q_values = self.ac.get_value( torch.cat([states, next_best_actions], dim=-1), mode="min" diff --git a/genrl/agents/deep/td3/td3.py b/genrl/agents/deep/td3/td3.py index 88f8c074..5f3f1585 100644 --- a/genrl/agents/deep/td3/td3.py +++ b/genrl/agents/deep/td3/td3.py @@ -79,9 +79,9 @@ def _create_model(self) -> None: value_layers=self.value_layers, val_type="Qsa", discrete=False, - ) + ).to(self.device) else: - self.ac = self.network + self.ac = self.network.to(self.device) if self.noise is not None: self.noise = self.noise( diff --git a/genrl/agents/deep/vpg/vpg.py b/genrl/agents/deep/vpg/vpg.py index 3b34d9d3..a236880f 100644 --- a/genrl/agents/deep/vpg/vpg.py +++ b/genrl/agents/deep/vpg/vpg.py @@ -86,8 +86,8 @@ def select_action( return ( action.detach(), - torch.zeros((1, self.env.n_envs)), - dist.log_prob(action).cpu(), + torch.zeros((1, self.env.n_envs), device=self.device), + dist.log_prob(action), ) def get_log_probs(self, states: torch.Tensor, actions: torch.Tensor): @@ -105,7 +105,7 @@ def get_log_probs(self, states: torch.Tensor, actions: torch.Tensor): """ 
states, actions = states.to(self.device), actions.to(self.device) _, dist = self.actor.get_action(states, deterministic=False) - return dist.log_prob(actions).cpu() + return dist.log_prob(actions) def get_traj_loss(self, values, dones): """Get loss from trajectory traversed by agent during rollouts @@ -117,7 +117,7 @@ def get_traj_loss(self, values, dones): dones (:obj:`list` of bool): Game over statuses of each environment """ self.rollout.returns, self.rollout.advantages = compute_returns_and_advantage( - self.rollout, values.detach().cpu().numpy(), dones.cpu().numpy() + self.rollout, values.detach().to(self.device), dones.to(self.device) ) def update_params(self) -> None: diff --git a/genrl/core/buffers.py b/genrl/core/buffers.py index 0a5b6e7c..8c330487 100644 --- a/genrl/core/buffers.py +++ b/genrl/core/buffers.py @@ -32,9 +32,10 @@ class ReplayBuffer: :type capacity: int """ - def __init__(self, capacity: int): + def __init__(self, capacity: int, device="cpu"): self.capacity = capacity self.memory = deque([], maxlen=capacity) + self.device = device def push(self, inp: Tuple) -> None: """ @@ -60,7 +61,7 @@ def sample( batch = random.sample(self.memory, batch_size) state, action, reward, next_state, done = map(np.stack, zip(*batch)) return [ - torch.from_numpy(v).float() + torch.from_numpy(v).float().to(self.device) for v in [state, action, reward, next_state, done] ] diff --git a/genrl/core/rollout_storage.py b/genrl/core/rollout_storage.py index 16d1c721..9d13ecfd 100644 --- a/genrl/core/rollout_storage.py +++ b/genrl/core/rollout_storage.py @@ -133,8 +133,8 @@ def to_torch(self, array: np.ndarray, copy: bool = True) -> torch.Tensor: :return: (torch.Tensor) """ if copy: - return array.detach().clone() - return array + return array.detach().clone().to(self.device) + return array.to(self.device) class RolloutBuffer(BaseBuffer): @@ -173,17 +173,27 @@ def __init__( def reset(self) -> None: self.observations = torch.zeros( - *(self.buffer_size, self.env.n_envs, *self.env.obs_shape) + *(self.buffer_size, self.env.n_envs, *self.env.obs_shape), + device=self.device ) self.actions = torch.zeros( - *(self.buffer_size, self.env.n_envs, *self.env.action_shape) + *(self.buffer_size, self.env.n_envs, *self.env.action_shape), + device=self.device + ) + self.rewards = torch.zeros( + self.buffer_size, self.env.n_envs, device=self.device + ) + self.returns = torch.zeros( + self.buffer_size, self.env.n_envs, device=self.device + ) + self.dones = torch.zeros(self.buffer_size, self.env.n_envs, device=self.device) + self.values = torch.zeros(self.buffer_size, self.env.n_envs, device=self.device) + self.log_probs = torch.zeros( + self.buffer_size, self.env.n_envs, device=self.device + ) + self.advantages = torch.zeros( + self.buffer_size, self.env.n_envs, device=self.device ) - self.rewards = torch.zeros(self.buffer_size, self.env.n_envs) - self.returns = torch.zeros(self.buffer_size, self.env.n_envs) - self.dones = torch.zeros(self.buffer_size, self.env.n_envs) - self.values = torch.zeros(self.buffer_size, self.env.n_envs) - self.log_probs = torch.zeros(self.buffer_size, self.env.n_envs) - self.advantages = torch.zeros(self.buffer_size, self.env.n_envs) self.generator_ready = False super(RolloutBuffer, self).reset() @@ -210,12 +220,12 @@ def add( # Reshape 0-d tensor to avoid error log_prob = log_prob.reshape(-1, 1) - self.observations[self.pos] = obs.detach().clone() - self.actions[self.pos] = action.detach().clone() - self.rewards[self.pos] = reward.detach().clone() - self.dones[self.pos] = 
done.detach().clone()
-        self.values[self.pos] = value.detach().clone().flatten()
-        self.log_probs[self.pos] = log_prob.detach().clone().flatten()
+        self.observations[self.pos] = obs.detach().clone().to(self.device)
+        self.actions[self.pos] = action.detach().clone().to(self.device)
+        self.rewards[self.pos] = reward.detach().clone().to(self.device)
+        self.dones[self.pos] = done.detach().clone().to(self.device)
+        self.values[self.pos] = value.detach().clone().flatten().to(self.device)
+        self.log_probs[self.pos] = log_prob.detach().clone().flatten().to(self.device)
         self.pos += 1
         if self.pos == self.buffer_size:
             self.full = True
diff --git a/genrl/trainers/offpolicy.py b/genrl/trainers/offpolicy.py
index 7e0571c2..a95324c2 100644
--- a/genrl/trainers/offpolicy.py
+++ b/genrl/trainers/offpolicy.py
@@ -65,7 +65,9 @@ def __init__(
         self.buffer = self.agent.replay_buffer
 
     def noise_reset(self) -> None:
-        """Resets the agent's action noise functions"""
+        """
+        Resets the agent's action noise functions
+        """
         if "noise" in self.agent.__dict__ and self.agent.noise is not None:
             self.agent.noise.reset()
 
diff --git a/tests/test_agents/test_bandit/__init__.py b/tests/test_agents/test_bandit/__init__.py
index 4411dff3..8faedc3d 100644
--- a/tests/test_agents/test_bandit/__init__.py
+++ b/tests/test_agents/test_bandit/__init__.py
@@ -1,6 +1,6 @@
 from tests.test_agents.test_bandit.test_cb_agents import TestCBAgent  # noqa
 from tests.test_agents.test_bandit.test_data_bandits import TestDataBandit  # noqa
 from tests.test_agents.test_bandit.test_mab_agents import TestMABAgent  # noqa
-from tests.test_agents.test_bandit.test_multi_armed_bandits import (
-    TestMultiArmedBandit,  # noqa
+from tests.test_agents.test_bandit.test_multi_armed_bandits import (  # noqa
+    TestMultiArmedBandit,
 )

From eae42fd9a9a04b87be17eb5e527e9e01f10e48da Mon Sep 17 00:00:00 2001
From: hades-rp2010
Date: Fri, 27 Nov 2020 23:32:34 +0530
Subject: [PATCH 22/26] Update to miniconda-v2

---
 .github/workflows/test_linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_linux.yml b/.github/workflows/test_linux.yml
index cad13ccf..72be4e69 100644
--- a/.github/workflows/test_linux.yml
+++ b/.github/workflows/test_linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - uses: goanpeca/setup-miniconda@v1
+    - uses: goanpeca/setup-miniconda@v2
       with:
         auto-update-conda: true
         python-version: ${{ matrix.python-version }}

From 6ccd463c8f80475e69c57bc6465fdfa4147cf132 Mon Sep 17 00:00:00 2001
From: hades-rp2010
Date: Fri, 27 Nov 2020 23:35:31 +0530
Subject: [PATCH 23/26] Update to miniconda-v2 (2)

---
 .github/workflows/test_linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_linux.yml b/.github/workflows/test_linux.yml
index 72be4e69..6150e053 100644
--- a/.github/workflows/test_linux.yml
+++ b/.github/workflows/test_linux.yml
@@ -52,7 +52,7 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - uses: goanpeca/setup-miniconda@v2
+    - uses: conda-incubator/setup-miniconda@v22
      with:
        auto-update-conda: true
        python-version: ${{ matrix.python-version }}

From f79f41e53a763d79fa0a2acbcb1b804f5a62a408 Mon Sep 17 00:00:00 2001
From: hades-rp2010
Date: Fri, 27 Nov 2020 23:37:22 +0530
Subject: [PATCH 24/26] Update to miniconda-v2 (3)

---
 .github/workflows/test_linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_linux.yml b/.github/workflows/test_linux.yml
index 6150e053..b7ae5ac6 100644
--- a/.github/workflows/test_linux.yml
+++
b/.github/workflows/test_linux.yml @@ -52,7 +52,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: conda-incubator/setup-miniconda@v22 + - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true python-version: ${{ matrix.python-version }} From d62f44e26fcef3ad8df9c49173d22a5225478590 Mon Sep 17 00:00:00 2001 From: hades-rp2010 Date: Fri, 27 Nov 2020 23:42:07 +0530 Subject: [PATCH 25/26] Update to miniconda-v2 (4) --- .github/workflows/test_windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_windows.yml b/.github/workflows/test_windows.yml index 77be36c8..7462649b 100644 --- a/.github/workflows/test_windows.yml +++ b/.github/workflows/test_windows.yml @@ -52,7 +52,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: goanpeca/setup-miniconda@v1 + - uses: conda-incubator/setup-miniconda@v21 with: auto-update-conda: true python-version: ${{ matrix.python-version }} From b2f71a659940138abf99744088095feae6b465e8 Mon Sep 17 00:00:00 2001 From: hades-rp2010 Date: Fri, 27 Nov 2020 23:47:13 +0530 Subject: [PATCH 26/26] Update to miniconda-v2 (4) --- .github/workflows/test_macos.yml | 2 +- .github/workflows/test_windows.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_macos.yml b/.github/workflows/test_macos.yml index 539cacc9..af7f4907 100644 --- a/.github/workflows/test_macos.yml +++ b/.github/workflows/test_macos.yml @@ -52,7 +52,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: goanpeca/setup-miniconda@v1 + - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/test_windows.yml b/.github/workflows/test_windows.yml index 7462649b..9e01bc2d 100644 --- a/.github/workflows/test_windows.yml +++ b/.github/workflows/test_windows.yml @@ -52,7 +52,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - - uses: conda-incubator/setup-miniconda@v21 + - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true python-version: ${{ matrix.python-version }}
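
Note on the advantage computation introduced in this series: the ``compute_returns_and_advantage`` helper that ends up in ``genrl/utils/discount.py`` unifies GAE and the plain bootstrapped discounted return (``gae_lambda = 1.0``) in a single backward recursion. The snippet below is a minimal, self-contained sketch of that recursion on a tiny dummy buffer, handy for sanity-checking the patches by hand; the ``DummyRollout`` class and the literal numbers are illustrative only and are not part of GenRL.

.. code-block:: python

    import torch


    class DummyRollout:
        """Tiny stand-in for genrl.core.RolloutBuffer: 4 steps, 1 env (illustrative only)."""

        def __init__(self):
            self.buffer_size = 4
            self.gamma = 0.99
            self.gae_lambda = 0.95
            self.rewards = torch.ones(4, 1)
            self.dones = torch.tensor([[0.0], [0.0], [0.0], [1.0]])
            self.values = torch.tensor([[0.5], [0.6], [0.7], [0.8]])
            self.advantages = torch.zeros(4, 1)
            self.returns = torch.zeros(4, 1)


    def compute_returns_and_advantage(buf, last_value, dones, use_gae=False):
        # gae_lambda = 1.0 collapses GAE to the bootstrapped discounted return
        gae_lambda = buf.gae_lambda if use_gae else 1.0
        next_value = last_value.flatten()
        next_non_terminal = 1.0 - dones

        running_advantage = 0.0
        for step in reversed(range(buf.buffer_size)):
            # one-step TD error at `step`
            delta = (
                buf.rewards[step]
                + buf.gamma * next_value * next_non_terminal
                - buf.values[step]
            )
            # backward GAE recursion, zeroed across episode boundaries
            running_advantage = (
                delta + buf.gamma * gae_lambda * next_non_terminal * running_advantage
            )
            next_non_terminal = 1.0 - buf.dones[step]
            next_value = buf.values[step]
            buf.advantages[step] = running_advantage

        buf.returns = buf.advantages + buf.values
        return buf.returns, buf.advantages


    buf = DummyRollout()
    returns, advantages = compute_returns_and_advantage(
        buf, last_value=torch.tensor([0.9]), dones=torch.zeros(1), use_gae=True
    )
    print(returns.squeeze().tolist(), advantages.squeeze().tolist())

With ``use_gae=True`` future TD errors are weighted by ``gamma * gae_lambda``; with ``use_gae=False`` the loop reduces to the ordinary discounted-return-minus-value advantage, which is why the two earlier branches could be collapsed into one recursion.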