From c7107d1226f54c1491159bb6abba54526f3e5966 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 13:10:22 +0300 Subject: [PATCH 01/11] wip IQL integration, config refactoring --- .gitignore | 1 + algorithms/minari/iql.py | 632 ++++++++++++++++++++++++++++++ algorithms/offline/iql.py | 1 - requirements/requirements.txt | 3 + requirements/requirements_dev.txt | 5 +- 5 files changed, 640 insertions(+), 2 deletions(-) create mode 100644 algorithms/minari/iql.py diff --git a/.gitignore b/.gitignore index ce35dd32..440cddbc 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ wandb_run.sh sweep_config.yaml .ml-job-preset.yml wandb +algorithms/minari/testing.py # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py new file mode 100644 index 00000000..8bd35181 --- /dev/null +++ b/algorithms/minari/iql.py @@ -0,0 +1,632 @@ +# source: https://github.com/gwthomas/IQL-PyTorch +# https://arxiv.org/pdf/2110.06169.pdf + +# Implementation TODOs: +# 1. iql_deterministic is true only for 2 datasets. Can we achieve same scores without it and remote it? +# 2. MLP class introduced bugs in the past. We should remove it and use simple nn.Sequential. +# 3. Refactor IQL updating code to be more consistent in style +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import copy +from dataclasses import asdict, dataclass +import os +import random +import uuid + +import minari +import gymnasium as gym +import numpy as np +import pyrallis +import torch +from torch.distributions import Normal +import torch.nn as nn +import torch.nn.functional as F +from torch.optim.lr_scheduler import CosineAnnealingLR +import wandb + +TensorBatch = List[torch.Tensor] + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" +EXP_ADV_MAX = 100.0 +LOG_STD_MIN = -20.0 +LOG_STD_MAX = 2.0 + + +@dataclass +class TrainConfig: + # wandb params + project: str = "CORL" + group: str = "IQL-Minari" + name: str = "iql" + # model params + gamma: float = 0.99 # Discount factor + tau: float = 0.005 # Target network update rate + beta: float = 3.0 # Inverse temperature. 
Small beta -> BC, big beta -> maximizing Q + iql_tau: float = 0.7 # Coefficient for asymmetric loss + iql_deterministic: bool = False # Use deterministic actor + vf_lr: float = 3e-4 # V function learning rate + qf_lr: float = 3e-4 # Critic learning rate + actor_lr: float = 3e-4 # Actor learning rate + actor_dropout: Optional[float] = None # Adroit uses dropout for policy network + # training params + dataset_id: str = "pen-human-v0" # Minari remote dataset name + update_steps: int = int(1e6) # Total training networks updates + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + normalize_state: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward + # evaluation params + eval_every: int = int(5e3) # How often (time steps) we evaluate + eval_episodes: int = 10 # How many episodes run during evaluation + # general params + train_seed: int = 0 + eval_seed: int = 0 + checkpoints_path: Optional[str] = None # Save path + + def __post_init__(self): + self.name = f"{self.name}-{self.dataset_id}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def set_seed(seed: int, deterministic_torch: bool = False): + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + # epsilon should be already added in std. + return (state - state_mean) / state_std + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! 
+ return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +# This is how reward normalization among all datasets is done in original IQL +def return_reward_range(dataset, max_episode_steps): + returns, lengths = [], [] + ep_ret, ep_len = 0.0, 0 + for r, d in zip(dataset["rewards"], dataset["terminals"]): + ep_ret += float(r) + ep_len += 1 + if d or ep_len == max_episode_steps: + returns.append(ep_ret) + lengths.append(ep_len) + ep_ret, ep_len = 0.0, 0 + lengths.append(ep_len) # but still keep track of number of steps + assert sum(lengths) == len(dataset["rewards"]) + return min(returns), max(returns) + + +def modify_reward(dataset, env_name, max_episode_steps=1000): + if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): + min_ret, max_ret = return_reward_range(dataset, max_episode_steps) + dataset["rewards"] /= max_ret - min_ret + dataset["rewards"] *= max_episode_steps + elif "antmaze" in env_name: + dataset["rewards"] -= 1.0 + + +# WARN: this will load full dataset in memory (which is OK for D4RL datasets) +def qlearning_dataset(dataset: minari.MinariDataset) -> Dict[str, np.ndarray]: + obs, next_obs, actions, rewards, dones = [], [], [], [], [] + + for episode in dataset: + obs.append(episode.observations[:-1].astype(np.float32)) + next_obs.append(episode.observations[1:].astype(np.float32)) + actions.append(episode.actions.astype(np.float32)) + rewards.append(episode.rewards) + dones.append(episode.terminations) + + return { + "observations": np.concatenate(obs), + "actions": np.concatenate(actions), + "next_observations": np.concatenate(next_obs), + "rewards": np.concatenate(rewards), + "terminals": np.concatenate(dones) + } + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array] after q_learning_dataset. + def load_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" 
+ ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + + self._size = self._pointer = n_transitions + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, min(self._size, self._pointer), size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition(self): + # Use this method to add new data into the replay buffer during fine-tuning. + # I left it unimplemented since now we do not do fine-tuning. + raise NotImplementedError + + +def asymmetric_l2_loss(u: torch.Tensor, tau: float) -> torch.Tensor: + return torch.mean(torch.abs(tau - (u < 0).float()) * u**2) + + +class Squeeze(nn.Module): + def __init__(self, dim=-1): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.squeeze(dim=self.dim) + + +class MLP(nn.Module): + def __init__( + self, + dims, + activation_fn: Callable[[], nn.Module] = nn.ReLU, + output_activation_fn: Callable[[], nn.Module] = None, + squeeze_output: bool = False, + dropout: Optional[float] = None, + ): + super().__init__() + n_dims = len(dims) + if n_dims < 2: + raise ValueError("MLP requires at least two dims (input and output)") + + layers = [] + for i in range(n_dims - 2): + layers.append(nn.Linear(dims[i], dims[i + 1])) + layers.append(activation_fn()) + + if dropout is not None: + layers.append(nn.Dropout(dropout)) + + layers.append(nn.Linear(dims[-2], dims[-1])) + if output_activation_fn is not None: + layers.append(output_activation_fn()) + if squeeze_output: + if dims[-1] != 1: + raise ValueError("Last dim must be 1 when squeezing") + layers.append(Squeeze(-1)) + self.net = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) + + +class GaussianPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: Optional[float] = None, + ): + super().__init__() + self.net = MLP( + [state_dim, *([hidden_dim] * n_hidden), act_dim], + output_activation_fn=nn.Tanh, + ) + self.log_std = nn.Parameter(torch.zeros(act_dim, dtype=torch.float32)) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> Normal: + mean = self.net(obs) + std = torch.exp(self.log_std.clamp(LOG_STD_MIN, LOG_STD_MAX)) + return Normal(mean, std) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + dist = self(state) + action = dist.mean if not self.training else dist.sample() + action = torch.clamp(self.max_action * action, -self.max_action, self.max_action) + return action.cpu().data.numpy().flatten() + + +class DeterministicPolicy(nn.Module): + def __init__( + self, + state_dim: int, + act_dim: int, + max_action: float, + hidden_dim: int = 256, + n_hidden: int = 2, + dropout: Optional[float] = None, + ): + super().__init__() + self.net = MLP( + [state_dim, *([hidden_dim] * n_hidden), 
act_dim], + output_activation_fn=nn.Tanh, + dropout=dropout, + ) + self.max_action = max_action + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + return self.net(obs) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu"): + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return ( + torch.clamp(self(state) * self.max_action, -self.max_action, self.max_action) + .cpu() + .data.numpy() + .flatten() + ) + + +class TwinQ(nn.Module): + def __init__( + self, state_dim: int, action_dim: int, hidden_dim: int = 256, n_hidden: int = 2 + ): + super().__init__() + dims = [state_dim + action_dim, *([hidden_dim] * n_hidden), 1] + self.q1 = MLP(dims, squeeze_output=True) + self.q2 = MLP(dims, squeeze_output=True) + + def both( + self, state: torch.Tensor, action: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + sa = torch.cat([state, action], 1) + return self.q1(sa), self.q2(sa) + + def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor: + return torch.min(*self.both(state, action)) + + +class ValueFunction(nn.Module): + def __init__(self, state_dim: int, hidden_dim: int = 256, n_hidden: int = 2): + super().__init__() + dims = [state_dim, *([hidden_dim] * n_hidden), 1] + self.v = MLP(dims, squeeze_output=True) + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.v(state) + + +class ImplicitQLearning: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + actor_lr_scheduler: torch.optim.lr_scheduler.LRScheduler, + q_network: nn.Module, + q_optimizer: torch.optim.Optimizer, + v_network: nn.Module, + v_optimizer: torch.optim.Optimizer, + iql_tau: float = 0.7, + beta: float = 3.0, + gamma: float = 0.99, + tau: float = 0.005, + device: str = "cpu", + ): + self.max_action = max_action + self.qf = q_network + self.q_target = copy.deepcopy(self.qf).requires_grad_(False).to(device) + self.vf = v_network + self.actor = actor + self.v_optimizer = v_optimizer + self.q_optimizer = q_optimizer + self.actor_optimizer = actor_optimizer + self.actor_lr_scheduler = actor_lr_scheduler + self.iql_tau = iql_tau + self.beta = beta + self.gamma = gamma + self.tau = tau + + self.total_it = 0 + self.device = device + + def _update_v(self, observations, actions, log_dict) -> torch.Tensor: + # Update value function + with torch.no_grad(): + target_q = self.q_target(observations, actions) + + v = self.vf(observations) + adv = target_q - v + v_loss = asymmetric_l2_loss(adv, self.iql_tau) + log_dict["value_loss"] = v_loss.item() + self.v_optimizer.zero_grad() + v_loss.backward() + self.v_optimizer.step() + return adv + + def _update_q( + self, + next_v: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + terminals: torch.Tensor, + log_dict: Dict, + ): + targets = rewards + (1.0 - terminals.float()) * self.gamma * next_v.detach() + qs = self.qf.both(observations, actions) + q_loss = sum(F.mse_loss(q, targets) for q in qs) / len(qs) + log_dict["q_loss"] = q_loss.item() + self.q_optimizer.zero_grad() + q_loss.backward() + self.q_optimizer.step() + + # Update target Q network + soft_update(self.q_target, self.qf, self.tau) + + def _update_policy( + self, + adv: torch.Tensor, + observations: torch.Tensor, + actions: torch.Tensor, + log_dict: Dict, + ): + exp_adv = torch.exp(self.beta * adv.detach()).clamp(max=EXP_ADV_MAX) + policy_out = self.actor(observations) + if isinstance(policy_out, torch.distributions.Distribution): + 
bc_losses = -policy_out.log_prob(actions).sum(-1, keepdim=False) + elif torch.is_tensor(policy_out): + if policy_out.shape != actions.shape: + raise RuntimeError("Actions shape mismatch") + bc_losses = torch.sum((policy_out - actions) ** 2, dim=1) + else: + raise NotImplementedError + policy_loss = torch.mean(exp_adv * bc_losses) + log_dict["actor_loss"] = policy_loss.item() + self.actor_optimizer.zero_grad() + policy_loss.backward() + self.actor_optimizer.step() + self.actor_lr_scheduler.step() + + def train(self, batch: TensorBatch) -> Dict[str, float]: + self.total_it += 1 + ( + observations, + actions, + rewards, + next_observations, + dones, + ) = batch + log_dict = {} + + with torch.no_grad(): + next_v = self.vf(next_observations) + # Update value function + adv = self._update_v(observations, actions, log_dict) + rewards = rewards.squeeze(dim=-1) + dones = dones.squeeze(dim=-1) + # Update Q function + self._update_q(next_v, observations, actions, rewards, dones, log_dict) + # Update actor + self._update_policy(adv, observations, actions, log_dict) + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "qf": self.qf.state_dict(), + "q_optimizer": self.q_optimizer.state_dict(), + "vf": self.vf.state_dict(), + "v_optimizer": self.v_optimizer.state_dict(), + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + "actor_lr_scheduler": self.actor_lr_scheduler.state_dict(), + "total_it": self.total_it, + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.qf.load_state_dict(state_dict["qf"]) + self.q_optimizer.load_state_dict(state_dict["q_optimizer"]) + self.q_target = copy.deepcopy(self.qf) + + self.vf.load_state_dict(state_dict["vf"]) + self.v_optimizer.load_state_dict(state_dict["v_optimizer"]) + + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + self.actor_lr_scheduler.load_state_dict(state_dict["actor_lr_scheduler"]) + + self.total_it = state_dict["total_it"] + + +@torch.no_grad() +def evaluate(env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str) -> np.ndarray: + actor.eval() + episode_rewards = [] + for i in range(num_episodes): + done = False + state, info = env.reset(seed=seed + i) + + episode_reward = 0.0 + while not done: + action = actor.act(state, device) + state, reward, terminated, truncated, info = env.step(action) + done = terminated or truncated + episode_reward += reward + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards) + + +@pyrallis.wrap() +def train(config: TrainConfig): + wandb.init( + config=asdict(config), + project=config.project, + group=config.group, + name=config.name, + id=str(uuid.uuid4()), + save_code=True + ) + minari.download_dataset(config.dataset_id) + dataset = minari.load_dataset(config.dataset_id) + + eval_env = dataset.recover_environment() + state_dim = eval_env.observation_space.shape[0] + action_dim = eval_env.action_space.shape[0] + max_action = float(eval_env.action_space.high[0]) + + dataset = qlearning_dataset(dataset) + if config.normalize_reward: + modify_reward(dataset, config.dataset_id) + + if config.normalize_state: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states(dataset["observations"], state_mean, state_std) + dataset["next_observations"] = normalize_states(dataset["next_observations"], state_mean, state_std) +
+ eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + DEVICE, + ) + replay_buffer.load_dataset(dataset) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seeds + set_seed(config.train_seed) + + q_network = TwinQ(state_dim, action_dim).to(DEVICE) + v_network = ValueFunction(state_dim).to(DEVICE) + if config.iql_deterministic: + actor = DeterministicPolicy(state_dim, action_dim, max_action, dropout=config.actor_dropout).to(DEVICE) + else: + actor = GaussianPolicy(state_dim, action_dim, max_action, dropout=config.actor_dropout).to(DEVICE) + + v_optimizer = torch.optim.Adam(v_network.parameters(), lr=config.vf_lr) + q_optimizer = torch.optim.Adam(q_network.parameters(), lr=config.qf_lr) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=config.actor_lr) + actor_lr_scheduler = CosineAnnealingLR(actor_optimizer, config.update_steps) + + trainer = ImplicitQLearning( + max_action=max_action, + actor=actor, + actor_optimizer=actor_optimizer, + actor_lr_scheduler=actor_lr_scheduler, + q_network=q_network, + q_optimizer=q_optimizer, + v_network=v_network, + v_optimizer=v_optimizer, + iql_tau=config.iql_tau, + beta=config.beta, + gamma=config.gamma, + tau=config.tau, + device=DEVICE + ) + + for step in range(int(config.update_steps)): + batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] + log_dict = trainer.train(batch) + + wandb.log(log_dict, step=trainer.total_it) + + if (step + 1) % config.eval_every == 0: + eval_scores = evaluate( + env=eval_env, + actor=actor, + num_episodes=config.eval_episodes, + seed=config.eval_seed, + device=DEVICE + ) + eval_score = eval_scores.mean() + # TODO: Minari does not support normalized scores for now. We will revisit this later. 
+ # normalized_eval_score = env.get_normalized_score(eval_score) * 100.0 + wandb.log( + # {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it + {"evaluation_return": eval_score}, step=trainer.total_it + ) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{step}.pt"), + ) + + +if __name__ == "__main__": + train() diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py index 22aff0a2..bb6870e7 100644 --- a/algorithms/offline/iql.py +++ b/algorithms/offline/iql.py @@ -21,7 +21,6 @@ TensorBatch = List[torch.Tensor] - EXP_ADV_MAX = 100.0 LOG_STD_MIN = -20.0 LOG_STD_MAX = 2.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 00e3db5e..32599eee 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -9,3 +9,6 @@ gym[mujoco_py,classic_control]==0.23.0 torch==1.11.0+cu113 sortedcontainers==2.4.0 pyrallis==0.3.1 +# experimental, thus without a specific version for now +git+https://github.com/Farama-Foundation/Minari.git +gymnasium==0.28.1 \ No newline at end of file diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt index c1916cb5..0ac8e899 100644 --- a/requirements/requirements_dev.txt +++ b/requirements/requirements_dev.txt @@ -11,4 +11,7 @@ sortedcontainers==2.4.0 pyrallis==0.3.1 pre-commit==2.20.0 catalyst-codestyle==21.9.2 -pytest==7.1.2 \ No newline at end of file +pytest==7.1.2 +# experimental, thus without a specific version for now +git+https://github.com/Farama-Foundation/Minari.git +gymnasium==0.28.1 \ No newline at end of file From a0889bfe343b754862ba5452c1737d126c4fcca0 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 13:11:43 +0300 Subject: [PATCH 02/11] dropout bug fix --- algorithms/minari/iql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py index 8bd35181..956b7197 100644 --- a/algorithms/minari/iql.py +++ b/algorithms/minari/iql.py @@ -281,6 +281,7 @@ def __init__( self.net = MLP( [state_dim, *([hidden_dim] * n_hidden), act_dim], output_activation_fn=nn.Tanh, + dropout=dropout, ) self.log_std = nn.Parameter(torch.zeros(act_dim, dtype=torch.float32)) self.max_action = max_action From ab6dc4c6169288b7a439e82da3c3b1af0ccbf168 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 13:39:23 +0300 Subject: [PATCH 03/11] pre-commit style --- algorithms/minari/iql.py | 77 ++++++++++++++++++------------- algorithms/offline/iql.py | 1 + requirements/requirements.txt | 2 +- requirements/requirements_dev.txt | 2 +- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py index 956b7197..201589b9 100644 --- a/algorithms/minari/iql.py +++ b/algorithms/minari/iql.py @@ -2,8 +2,8 @@ # https://arxiv.org/pdf/2110.06169.pdf # Implementation TODOs: -# 1. iql_deterministic is true only for 2 datasets. Can we achieve same scores without it and remote it? -# 2. MLP class introduced bugs in the past. We should remove it and use simple nn.Sequential. +# 1. iql_deterministic is true only for 2 datasets. Can we remove it? +# 2. MLP class introduced bugs in the past. We should remove it. +# 3.
Refactor IQL updating code to be more consistent in style from typing import Any, Callable, Dict, List, Optional, Tuple, Union import copy @@ -12,8 +12,8 @@ import random import uuid -import minari import gymnasium as gym +import minari import numpy as np import pyrallis import torch @@ -21,6 +21,8 @@ import torch.nn as nn import torch.nn.functional as F from torch.optim.lr_scheduler import CosineAnnealingLR +from tqdm.auto import trange + import wandb TensorBatch = List[torch.Tensor] @@ -38,25 +40,25 @@ class TrainConfig: group: str = "IQL-Minari" name: str = "iql" # model params - gamma: float = 0.99 # Discount factor - tau: float = 0.005 # Target network update rate - beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q - iql_tau: float = 0.7 # Coefficient for asymmetric loss - iql_deterministic: bool = False # Use deterministic actor - vf_lr: float = 3e-4 # V function learning rate - qf_lr: float = 3e-4 # Critic learning rate - actor_lr: float = 3e-4 # Actor learning rate - actor_dropout: Optional[float] = None # Adroit uses dropout for policy network + gamma: float = 0.99 # Discount factor + tau: float = 0.005 # Target network update rate + beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q + iql_tau: float = 0.7 # Coefficient for asymmetric loss + iql_deterministic: bool = False # Use deterministic actor + vf_lr: float = 3e-4 # V function learning rate + qf_lr: float = 3e-4 # Critic learning rate + actor_lr: float = 3e-4 # Actor learning rate + actor_dropout: Optional[float] = None # Adroit uses dropout for policy network # training params - dataset_id: str = "pen-human-v0" # Minari remote dataset name - update_steps: int = int(1e6) # Total training networks updates - buffer_size: int = 2_000_000 # Replay buffer size - batch_size: int = 256 # Batch size for all networks - normalize_state: bool = True # Normalize states - normalize_reward: bool = False # Normalize reward + dataset_id: str = "pen-human-v0" # Minari remote dataset name + update_steps: int = int(1e6) # Total training networks updates + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + normalize_state: bool = True # Normalize states + normalize_reward: bool = False # Normalize reward # evaluation params - eval_every: int = int(5e3) # How often (time steps) we evaluate - eval_episodes: int = 10 # How many episodes run during evaluation + eval_every: int = int(5e3) # How often (time steps) we evaluate + eval_episodes: int = 10 # How many episodes run during evaluation # general params train_seed: int = 0 eval_seed: int = 0 @@ -153,7 +155,7 @@ def qlearning_dataset(dataset: minari.MinariDataset) -> Dict[str, np.ndarray]: "actions": np.concatenate(actions), "next_observations": np.concatenate(next_obs), "rewards": np.concatenate(rewards), - "terminals": np.concatenate(dones) + "terminals": np.concatenate(dones), } @@ -505,7 +507,9 @@ def load_state_dict(self, state_dict: Dict[str, Any]): @torch.no_grad() -def evaluate(env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str) -> np.ndarray: +def evaluate( + env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str +) -> np.ndarray: actor.eval() episode_rewards = [] for i in range(num_episodes): @@ -532,7 +536,7 @@ def train(config: TrainConfig): group=config.group, name=config.name, id=str(uuid.uuid4()), - save_code=True + save_code=True, ) minari.download_dataset(config.dataset_id) dataset = 
minari.load_dataset(config.dataset_id) @@ -551,8 +555,12 @@ def train(config: TrainConfig): else: state_mean, state_std = 0, 1 - dataset["observations"] = normalize_states(dataset["observations"], state_mean, state_std) - dataset["next_observations"] = normalize_states(dataset["next_observations"], state_mean, state_std) + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) replay_buffer = ReplayBuffer( @@ -575,9 +583,13 @@ def train(config: TrainConfig): q_network = TwinQ(state_dim, action_dim).to(DEVICE) v_network = ValueFunction(state_dim).to(DEVICE) if config.iql_deterministic: - actor = DeterministicPolicy(state_dim, action_dim, max_action, dropout=config.actor_dropout).to(DEVICE) + actor = DeterministicPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ).to(DEVICE) else: - actor = GaussianPolicy(state_dim, action_dim, max_action, dropout=config.actor_dropout).to(DEVICE) + actor = GaussianPolicy( + state_dim, action_dim, max_action, dropout=config.actor_dropout + ).to(DEVICE) v_optimizer = torch.optim.Adam(v_network.parameters(), lr=config.vf_lr) q_optimizer = torch.optim.Adam(q_network.parameters(), lr=config.qf_lr) @@ -597,10 +609,10 @@ def train(config: TrainConfig): beta=config.beta, gamma=config.gamma, tau=config.tau, - device=DEVICE + device=DEVICE, ) - for step in range(int(config.update_steps)): + for step in trange(config.update_steps): batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] log_dict = trainer.train(batch) @@ -612,14 +624,15 @@ def train(config: TrainConfig): actor=actor, num_episodes=config.eval_episodes, seed=config.eval_seed, - device=DEVICE + device=DEVICE, ) eval_score = eval_scores.mean() - # TODO: Minari does not support normalized scores for now. We will revisit this later. + # TODO: Minari does not have normalized scores. We will revisit this later. 
# normalized_eval_score = env.get_normalized_score(eval_score) * 100.0 wandb.log( # {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it - {"evaluation_return": eval_score}, step=trainer.total_it + {"evaluation_return": eval_score}, + step=trainer.total_it, ) if config.checkpoints_path is not None: diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py index bb6870e7..22aff0a2 100644 --- a/algorithms/offline/iql.py +++ b/algorithms/offline/iql.py @@ -21,6 +21,7 @@ TensorBatch = List[torch.Tensor] + EXP_ADV_MAX = 100.0 LOG_STD_MIN = -20.0 LOG_STD_MAX = 2.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 32599eee..7249d0e4 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,6 @@ # Main dependencies git+https://github.com/tinkoff-ai/d4rl@master#egg=d4rl -tqdm==4.64.0 +tqdm==4.65.0 wandb==0.12.21 mujoco-py==2.1.2.14 numpy==1.23.1 diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt index 0ac8e899..51fc01a3 100644 --- a/requirements/requirements_dev.txt +++ b/requirements/requirements_dev.txt @@ -1,6 +1,6 @@ # Main dependencies git+https://github.com/tinkoff-ai/d4rl@master#egg=d4rl -tqdm==4.64.0 +tqdm==4.65.0 wandb==0.12.21 mujoco-py==2.1.2.14 numpy==1.23.1 From a64275eddfb1d7e4882721d97d5c610561e10bf1 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 13:47:30 +0300 Subject: [PATCH 04/11] fix linter --- algorithms/minari/iql.py | 1 - 1 file changed, 1 deletion(-) diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py index 201589b9..5e818b91 100644 --- a/algorithms/minari/iql.py +++ b/algorithms/minari/iql.py @@ -22,7 +22,6 @@ import torch.nn.functional as F from torch.optim.lr_scheduler import CosineAnnealingLR from tqdm.auto import trange - import wandb TensorBatch = List[torch.Tensor] From 9388c902bc6c663e326d24a848bcbd973a7b0985 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 17:45:39 +0300 Subject: [PATCH 05/11] added bc with minari --- algorithms/minari/any_percent_bc.py | 375 ++++++++++++++++++++++++++++ algorithms/minari/iql.py | 10 +- 2 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 algorithms/minari/any_percent_bc.py diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py new file mode 100644 index 00000000..f511d56d --- /dev/null +++ b/algorithms/minari/any_percent_bc.py @@ -0,0 +1,375 @@ +from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import asdict, dataclass +import os +import random +import uuid + +import gymnasium as gym +import minari +import numpy as np +import pyrallis +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm.auto import trange + +import wandb + +TensorBatch = List[torch.Tensor] +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + + +@dataclass +class TrainConfig: + # wandb params + project: str = "CORL" + group: str = "BC-Minari" + name: str = "bc" + # model params + gamma: float = 0.99 # Discount factor + top_fraction: float = 0.1 # Best data fraction to use + # training params + dataset_id: str = "pen-human-v0" # Minari remote dataset name + update_steps: int = int(1e6) # Total training networks updates + buffer_size: int = 2_000_000 # Replay buffer size + batch_size: int = 256 # Batch size for all networks + normalize_state: bool = True # Normalize states + # evaluation params + eval_every: int = int(5e3) # How often (time steps) we evaluate + eval_episodes: int = 10 # How many episodes 
run during evaluation + # general params + train_seed: int = 0 + eval_seed: int = 0 + checkpoints_path: Optional[str] = None # Save path + + def __post_init__(self): + self.name = f"{self.name}-{self.dataset_id}-{str(uuid.uuid4())[:8]}" + if self.checkpoints_path is not None: + self.checkpoints_path = os.path.join(self.checkpoints_path, self.name) + + +def set_seed(seed: int, deterministic_torch: bool = False): + os.environ["PYTHONHASHSEED"] = str(seed) + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(deterministic_torch) + + +def soft_update(target: nn.Module, source: nn.Module, tau: float): + for target_param, source_param in zip(target.parameters(), source.parameters()): + target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) + + +def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: + mean = states.mean(0) + std = states.std(0) + eps + return mean, std + + +def normalize_states(states: np.ndarray, mean: np.ndarray, std: np.ndarray): + return (states - mean) / std + + +def wrap_env( + env: gym.Env, + state_mean: Union[np.ndarray, float] = 0.0, + state_std: Union[np.ndarray, float] = 1.0, + reward_scale: float = 1.0, +) -> gym.Env: + # PEP 8: E731 do not assign a lambda expression, use a def + def normalize_state(state): + # epsilon should be already added in std. + return (state - state_mean) / state_std + + def scale_reward(reward): + # Please be careful, here reward is multiplied by scale! + return reward_scale * reward + + env = gym.wrappers.TransformObservation(env, normalize_state) + if reward_scale != 1.0: + env = gym.wrappers.TransformReward(env, scale_reward) + return env + + +def discounted_return(x: np.ndarray, gamma: float) -> np.ndarray: + total_return = x[-1] + for t in reversed(range(x.shape[0] - 1)): + total_return = x[t] + gamma * total_return + return total_return + + +def best_trajectories_ids( + dataset: minari.MinariDataset, top_fraction: float, gamma: float +) -> List[int]: + ids_and_return = [ + (episode.id, discounted_return(episode.rewards, gamma)) for episode in dataset + ] + ids_and_returns = sorted(ids_and_return, key=lambda t: -t[1]) + + top_ids = [id for (id, r) in ids_and_returns] + top_ids = top_ids[: max(1, int(top_fraction * len(ids_and_returns)))] + assert len(top_ids) > 0 + return top_ids + + +# WARN: this will load full dataset in memory (which is OK for D4RL datasets) +def qlearning_dataset( + dataset: minari.MinariDataset, traj_ids: List[int] +) -> Dict[str, np.ndarray]: + obs, next_obs, actions, rewards, dones = [], [], [], [], [] + + for episode in dataset.iterate_episodes(episode_indices=traj_ids): + obs.append(episode.observations[:-1].astype(np.float32)) + next_obs.append(episode.observations[1:].astype(np.float32)) + actions.append(episode.actions.astype(np.float32)) + rewards.append(episode.rewards) + dones.append(episode.terminations) + + return { + "observations": np.concatenate(obs), + "actions": np.concatenate(actions), + "next_observations": np.concatenate(next_obs), + "rewards": np.concatenate(rewards), + "terminals": np.concatenate(dones), + } + + +class ReplayBuffer: + def __init__( + self, + state_dim: int, + action_dim: int, + buffer_size: int, + device: str = "cpu", + ): + self._buffer_size = buffer_size + self._pointer = 0 + self._size = 0 + + self._states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._actions = torch.zeros( + (buffer_size, action_dim), dtype=torch.float32, 
device=device + ) + self._rewards = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._next_states = torch.zeros( + (buffer_size, state_dim), dtype=torch.float32, device=device + ) + self._dones = torch.zeros((buffer_size, 1), dtype=torch.float32, device=device) + self._device = device + + def _to_tensor(self, data: np.ndarray) -> torch.Tensor: + return torch.tensor(data, dtype=torch.float32, device=self._device) + + # Loads data in d4rl format, i.e. from Dict[str, np.array] after q_learning_dataset. + def load_dataset(self, data: Dict[str, np.ndarray]): + if self._size != 0: + raise ValueError("Trying to load data into non-empty replay buffer") + + n_transitions = data["observations"].shape[0] + if n_transitions > self._buffer_size: + raise ValueError( + "Replay buffer is smaller than the dataset you are trying to load!" + ) + self._states[:n_transitions] = self._to_tensor(data["observations"]) + self._actions[:n_transitions] = self._to_tensor(data["actions"]) + self._rewards[:n_transitions] = self._to_tensor(data["rewards"][..., None]) + self._next_states[:n_transitions] = self._to_tensor(data["next_observations"]) + self._dones[:n_transitions] = self._to_tensor(data["terminals"][..., None]) + + self._size = self._pointer = n_transitions + print(f"Dataset size: {n_transitions}") + + def sample(self, batch_size: int) -> TensorBatch: + indices = np.random.randint(0, min(self._size, self._pointer), size=batch_size) + states = self._states[indices] + actions = self._actions[indices] + rewards = self._rewards[indices] + next_states = self._next_states[indices] + dones = self._dones[indices] + return [states, actions, rewards, next_states, dones] + + def add_transition(self): + # Use this method to add new data into the replay buffer during fine-tuning. + # I left it unimplemented since now we do not do fine-tuning. 
+ raise NotImplementedError + + +class Actor(nn.Module): + def __init__(self, state_dim: int, action_dim: int, max_action: float): + super(Actor, self).__init__() + self.net = nn.Sequential( + nn.Linear(state_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, action_dim), + nn.Tanh(), + ) + self.max_action = max_action + + def forward(self, state: torch.Tensor) -> torch.Tensor: + return self.max_action * self.net(state) + + @torch.no_grad() + def act(self, state: np.ndarray, device: str = "cpu") -> np.ndarray: + state = torch.tensor(state.reshape(1, -1), device=device, dtype=torch.float32) + return self(state).cpu().data.numpy().flatten() + + +class BC: + def __init__( + self, + max_action: float, + actor: nn.Module, + actor_optimizer: torch.optim.Optimizer, + device: str = "cpu", + ): + self.actor = actor + self.actor_optimizer = actor_optimizer + self.max_action = max_action + self.device = device + + def train(self, batch: TensorBatch) -> Dict[str, float]: + log_dict = {} + state, action, _, _, _ = batch + + # Compute actor loss + pi = self.actor(state) + actor_loss = F.mse_loss(pi, action) + log_dict["actor_loss"] = actor_loss.item() + # Optimize the actor + self.actor_optimizer.zero_grad() + actor_loss.backward() + self.actor_optimizer.step() + + return log_dict + + def state_dict(self) -> Dict[str, Any]: + return { + "actor": self.actor.state_dict(), + "actor_optimizer": self.actor_optimizer.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]): + self.actor.load_state_dict(state_dict["actor"]) + self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) + + +@torch.no_grad() +def evaluate( + env: gym.Env, actor: nn.Module, num_episodes: int, seed: int, device: str +) -> np.ndarray: + actor.eval() + episode_rewards = [] + for i in range(num_episodes): + done = False + state, info = env.reset(seed=seed + i) + + episode_reward = 0.0 + while not done: + action = actor.act(state, device) + state, reward, terminated, truncated, info = env.step(action) + done = terminated or truncated + episode_reward += reward + episode_rewards.append(episode_reward) + + actor.train() + return np.asarray(episode_rewards) + + +@pyrallis.wrap() +def train(config: TrainConfig): + wandb.init( + config=asdict(config), + project=config.project, + group=config.group, + name=config.name, + id=str(uuid.uuid4()), + save_code=True, + ) + minari.download_dataset(config.dataset_id) + dataset = minari.load_dataset(config.dataset_id) + + eval_env = dataset.recover_environment() + state_dim = eval_env.observation_space.shape[0] + action_dim = eval_env.action_space.shape[0] + max_action = float(eval_env.action_space.high[0]) + + dataset = qlearning_dataset( + dataset=dataset, + traj_ids=best_trajectories_ids(dataset, config.top_fraction, config.gamma), + ) + if config.normalize_state: + state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + else: + state_mean, state_std = 0, 1 + + dataset["observations"] = normalize_states( + dataset["observations"], state_mean, state_std + ) + dataset["next_observations"] = normalize_states( + dataset["next_observations"], state_mean, state_std + ) + eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) + replay_buffer = ReplayBuffer( + state_dim, + action_dim, + config.buffer_size, + DEVICE, + ) + replay_buffer.load_dataset(dataset) + + if config.checkpoints_path is not None: + print(f"Checkpoints path: {config.checkpoints_path}") + os.makedirs(config.checkpoints_path, exist_ok=True) + 
with open(os.path.join(config.checkpoints_path, "config.yaml"), "w") as f: + pyrallis.dump(config, f) + + # Set seed + set_seed(config.train_seed) + + actor = Actor(state_dim, action_dim, max_action).to(DEVICE) + actor_optimizer = torch.optim.Adam(actor.parameters(), lr=3e-4) + + trainer = BC( + max_action=max_action, + actor=actor, + actor_optimizer=actor_optimizer, + device=DEVICE, + ) + + for step in trange(config.update_steps): + batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] + log_dict = trainer.train(batch) + + wandb.log(log_dict, step=step) + + if (step + 1) % config.eval_every == 0: + eval_scores = evaluate( + env=eval_env, + actor=actor, + num_episodes=config.eval_episodes, + seed=config.eval_seed, + device=DEVICE, + ) + eval_score = eval_scores.mean() + # TODO: Minari does not have normalized scores. We will revisit this later. + # normalized_eval_score = env.get_normalized_score(eval_score) * 100.0 + wandb.log( + # {"d4rl_normalized_score": normalized_eval_score}, + {"evaluation_return": eval_score}, + step=step, + ) + + if config.checkpoints_path is not None: + torch.save( + trainer.state_dict(), + os.path.join(config.checkpoints_path, f"checkpoint_{step}.pt"), + ) + + +if __name__ == "__main__": + train() diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py index 5e818b91..09862872 100644 --- a/algorithms/minari/iql.py +++ b/algorithms/minari/iql.py @@ -392,8 +392,6 @@ def __init__( self.beta = beta self.gamma = gamma self.tau = tau - - self.total_it = 0 self.device = device def _update_v(self, observations, actions, log_dict) -> torch.Tensor: @@ -455,7 +453,6 @@ def _update_policy( self.actor_lr_scheduler.step() def train(self, batch: TensorBatch) -> Dict[str, float]: - self.total_it += 1 ( observations, actions, @@ -487,7 +484,6 @@ def state_dict(self) -> Dict[str, Any]: "actor": self.actor.state_dict(), "actor_optimizer": self.actor_optimizer.state_dict(), "actor_lr_scheduler": self.actor_lr_scheduler.state_dict(), - "total_it": self.total_it, } def load_state_dict(self, state_dict: Dict[str, Any]): @@ -502,8 +498,6 @@ def load_state_dict(self, state_dict: Dict[str, Any]): self.actor_optimizer.load_state_dict(state_dict["actor_optimizer"]) self.actor_lr_scheduler.load_state_dict(state_dict["actor_lr_scheduler"]) - self.total_it = state_dict["total_it"] - @torch.no_grad() def evaluate( @@ -615,7 +609,7 @@ def train(config: TrainConfig): batch = [b.to(DEVICE) for b in replay_buffer.sample(config.batch_size)] log_dict = trainer.train(batch) - wandb.log(log_dict, step=trainer.total_it) + wandb.log(log_dict, step=step) if (step + 1) % config.eval_every == 0: eval_scores = evaluate( @@ -631,7 +625,7 @@ def train(config: TrainConfig): wandb.log( # {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it {"evaluation_return": eval_score}, - step=trainer.total_it, + step=step, ) if config.checkpoints_path is not None: From a97ced0b226ed9802b70afaf51dd3ca1917e8db0 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 17:47:25 +0300 Subject: [PATCH 06/11] removed local from gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 440cddbc..ce35dd32 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ wandb_run.sh sweep_config.yaml .ml-job-preset.yml wandb -algorithms/minari/testing.py # Byte-compiled / optimized / DLL files __pycache__/ From d7098bff483c251405f245f6bf1a47034f14f634 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 17:55:31 +0300 Subject: [PATCH 07/11] 
linter fix --- algorithms/minari/any_percent_bc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py index f511d56d..878acf5e 100644 --- a/algorithms/minari/any_percent_bc.py +++ b/algorithms/minari/any_percent_bc.py @@ -12,7 +12,6 @@ import torch.nn as nn import torch.nn.functional as F from tqdm.auto import trange - import wandb TensorBatch = List[torch.Tensor] From 733f08e1e0a4704bb5ca3e1364155797710d6433 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 4 Jul 2023 18:04:13 +0300 Subject: [PATCH 08/11] removed unused function --- algorithms/minari/any_percent_bc.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py index 878acf5e..b0da0f6c 100644 --- a/algorithms/minari/any_percent_bc.py +++ b/algorithms/minari/any_percent_bc.py @@ -55,11 +55,6 @@ def set_seed(seed: int, deterministic_torch: bool = False): torch.use_deterministic_algorithms(deterministic_torch) -def soft_update(target: nn.Module, source: nn.Module, tau: float): - for target_param, source_param in zip(target.parameters(), source.parameters()): - target_param.data.copy_((1 - tau) * target_param.data + tau * source_param.data) - - def compute_mean_std(states: np.ndarray, eps: float) -> Tuple[np.ndarray, np.ndarray]: mean = states.mean(0) std = states.std(0) + eps From 3260e021c1678aa1b987ea597f2cc5b10de04125 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 18 Jul 2023 16:08:29 +0300 Subject: [PATCH 09/11] added normalized scores logging, typings --- algorithms/minari/any_percent_bc.py | 33 +++++++++++----------- algorithms/minari/iql.py | 43 ++++++++++++++++------------- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py index b0da0f6c..d7ca44b3 100644 --- a/algorithms/minari/any_percent_bc.py +++ b/algorithms/minari/any_percent_bc.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List, Optional, Tuple, Union +import contextlib from dataclasses import asdict, dataclass import os import random @@ -12,6 +13,7 @@ import torch.nn as nn import torch.nn.functional as F from tqdm.auto import trange + import wandb TensorBatch = List[torch.Tensor] @@ -28,7 +30,7 @@ class TrainConfig: gamma: float = 0.99 # Discount factor top_fraction: float = 0.1 # Best data fraction to use # training params - dataset_id: str = "pen-human-v0" # Minari remote dataset name + dataset_id: str = "pen-human-v1" # Minari remote dataset name update_steps: int = int(1e6) # Total training networks updates buffer_size: int = 2_000_000 # Replay buffer size batch_size: int = 256 # Batch size for all networks @@ -292,20 +294,20 @@ def train(config: TrainConfig): action_dim = eval_env.action_space.shape[0] max_action = float(eval_env.action_space.high[0]) - dataset = qlearning_dataset( + qdataset = qlearning_dataset( dataset=dataset, traj_ids=best_trajectories_ids(dataset, config.top_fraction, config.gamma), ) if config.normalize_state: - state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + state_mean, state_std = compute_mean_std(qdataset["observations"], eps=1e-3) else: state_mean, state_std = 0, 1 - dataset["observations"] = normalize_states( - dataset["observations"], state_mean, state_std + qdataset["observations"] = normalize_states( + qdataset["observations"], state_mean, state_std ) - dataset["next_observations"] = normalize_states( - dataset["next_observations"], state_mean, state_std + 
qdataset["next_observations"] = normalize_states( + qdataset["next_observations"], state_mean, state_std ) eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) replay_buffer = ReplayBuffer( @@ -314,7 +316,7 @@ def train(config: TrainConfig): config.buffer_size, DEVICE, ) - replay_buffer.load_dataset(dataset) + replay_buffer.load_dataset(qdataset) if config.checkpoints_path is not None: print(f"Checkpoints path: {config.checkpoints_path}") @@ -349,14 +351,13 @@ def train(config: TrainConfig): seed=config.eval_seed, device=DEVICE, ) - eval_score = eval_scores.mean() - # TODO: Minari does not have normalized scores. We will revisit this later. - # normalized_eval_score = env.get_normalized_score(eval_score) * 100.0 - wandb.log( - # {"d4rl_normalized_score": normalized_eval_score}, - {"evaluation_return": eval_score}, - step=step, - ) + wandb.log({"evaluation_return": eval_scores.mean()}, step=step) + # optional normalized score logging, only if dataset has reference scores + with contextlib.suppress(ValueError): + normalized_score = ( + minari.get_normalized_score(dataset, eval_scores).mean() * 100 + ) + wandb.log({"normalized_score": normalized_score}, step=step) if config.checkpoints_path is not None: torch.save( diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py index 09862872..9902eaa6 100644 --- a/algorithms/minari/iql.py +++ b/algorithms/minari/iql.py @@ -6,6 +6,7 @@ # 2. MLP class introduced bugs in the past. We should remove it. # 3. Refactor IQL updating code to be more consistent in style from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import contextlib import copy from dataclasses import asdict, dataclass import os @@ -22,6 +23,7 @@ import torch.nn.functional as F from torch.optim.lr_scheduler import CosineAnnealingLR from tqdm.auto import trange + import wandb TensorBatch = List[torch.Tensor] @@ -49,7 +51,7 @@ class TrainConfig: actor_lr: float = 3e-4 # Actor learning rate actor_dropout: Optional[float] = None # Adroit uses dropout for policy network # training params - dataset_id: str = "pen-human-v0" # Minari remote dataset name + dataset_id: str = "pen-human-v1" # Minari remote dataset name update_steps: int = int(1e6) # Total training networks updates buffer_size: int = 2_000_000 # Replay buffer size batch_size: int = 256 # Batch size for all networks @@ -114,7 +116,9 @@ def scale_reward(reward): # This is how reward normalization among all datasets is done in original IQL -def return_reward_range(dataset, max_episode_steps): +def return_reward_range( + dataset: Dict[str, np.ndarray], max_episode_steps: int +) -> Tuple[float, float]: returns, lengths = [], [] ep_ret, ep_len = 0.0, 0 for r, d in zip(dataset["rewards"], dataset["terminals"]): @@ -129,7 +133,9 @@ def return_reward_range(dataset, max_episode_steps): return min(returns), max(returns) -def modify_reward(dataset, env_name, max_episode_steps=1000): +def modify_reward( + dataset: Dict[str, np.ndarray], env_name: str, max_episode_steps: int = 1000 +): if any(s in env_name for s in ("halfcheetah", "hopper", "walker2d")): min_ret, max_ret = return_reward_range(dataset, max_episode_steps) dataset["rewards"] /= max_ret - min_ret @@ -539,20 +545,20 @@ def train(config: TrainConfig): action_dim = eval_env.action_space.shape[0] max_action = float(eval_env.action_space.high[0]) - dataset = qlearning_dataset(dataset) + qdataset = qlearning_dataset(dataset) if config.normalize_reward: - modify_reward(dataset, config.dataset_id) + modify_reward(qdataset, config.dataset_id) 
if config.normalize_state: - state_mean, state_std = compute_mean_std(dataset["observations"], eps=1e-3) + state_mean, state_std = compute_mean_std(qdataset["observations"], eps=1e-3) else: state_mean, state_std = 0, 1 - dataset["observations"] = normalize_states( - dataset["observations"], state_mean, state_std + qdataset["observations"] = normalize_states( + qdataset["observations"], state_mean, state_std ) - dataset["next_observations"] = normalize_states( - dataset["next_observations"], state_mean, state_std + qdataset["next_observations"] = normalize_states( + qdataset["next_observations"], state_mean, state_std ) eval_env = wrap_env(eval_env, state_mean=state_mean, state_std=state_std) @@ -562,7 +568,7 @@ def train(config: TrainConfig): config.buffer_size, DEVICE, ) - replay_buffer.load_dataset(dataset) + replay_buffer.load_dataset(qdataset) if config.checkpoints_path is not None: print(f"Checkpoints path: {config.checkpoints_path}") @@ -619,14 +625,13 @@ def train(config: TrainConfig): seed=config.eval_seed, device=DEVICE, ) - eval_score = eval_scores.mean() - # TODO: Minari does not have normalized scores. We will revisit this later. - # normalized_eval_score = env.get_normalized_score(eval_score) * 100.0 - wandb.log( - # {"d4rl_normalized_score": normalized_eval_score}, step=trainer.total_it - {"evaluation_return": eval_score}, - step=step, - ) + wandb.log({"evaluation_return": eval_scores.mean()}, step=step) + # optional normalized score logging, only if dataset has reference scores + with contextlib.suppress(ValueError): + normalized_score = ( + minari.get_normalized_score(dataset, eval_scores).mean() * 100 + ) + wandb.log({"normalized_score": normalized_score}, step=step) if config.checkpoints_path is not None: torch.save( From c1c8a3d2d2cf5a9416cbc96d27a011ce21fcdc79 Mon Sep 17 00:00:00 2001 From: Howuhh Date: Tue, 18 Jul 2023 17:24:49 +0300 Subject: [PATCH 10/11] configs for adroit datasets --- configs/minari/offline/bc/door/cloned_v1.yaml | 15 ++++++++++++ configs/minari/offline/bc/door/expert_v1.yaml | 15 ++++++++++++ configs/minari/offline/bc/door/human_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/hammer/cloned_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/hammer/expert_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/hammer/human_v1.yaml | 15 ++++++++++++ configs/minari/offline/bc/pen/cloned_v1.yaml | 15 ++++++++++++ configs/minari/offline/bc/pen/expert_v1.yaml | 15 ++++++++++++ configs/minari/offline/bc/pen/human_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/relocate/cloned_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/relocate/expert_v1.yaml | 15 ++++++++++++ .../minari/offline/bc/relocate/human_v1.yaml | 16 +++++++++++++ .../minari/offline/bc_10/door/cloned_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/door/expert_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/door/human_v1.yaml | 15 ++++++++++++ .../offline/bc_10/hammer/cloned_v1.yaml | 15 ++++++++++++ .../offline/bc_10/hammer/expert_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/hammer/human_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/pen/cloned_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/pen/expert_v1.yaml | 15 ++++++++++++ .../minari/offline/bc_10/pen/human_v1.yaml | 15 ++++++++++++ .../offline/bc_10/relocate/cloned_v1.yaml | 15 ++++++++++++ .../offline/bc_10/relocate/expert_v1.yaml | 15 ++++++++++++ .../offline/bc_10/relocate/human_v1.yaml | 16 +++++++++++++ .../minari/offline/iql/door/cloned_v1.yaml | 23 +++++++++++++++++++ .../minari/offline/iql/door/expert_v1.yaml | 23 
+++++++++++++++++++ configs/minari/offline/iql/door/human_v1.yaml | 23 +++++++++++++++++++ .../minari/offline/iql/hammer/cloned_v1.yaml | 23 +++++++++++++++++++ .../minari/offline/iql/hammer/expert_v1.yaml | 23 +++++++++++++++++++ .../minari/offline/iql/hammer/human_v1.yaml | 23 +++++++++++++++++++ configs/minari/offline/iql/pen/cloned_v1.yaml | 23 +++++++++++++++++++ configs/minari/offline/iql/pen/expert_v1.yaml | 23 +++++++++++++++++++ configs/minari/offline/iql/pen/human_v1.yaml | 23 +++++++++++++++++++ .../offline/iql/relocate/cloned_v1.yaml | 23 +++++++++++++++++++ .../offline/iql/relocate/expert_v1.yaml | 23 +++++++++++++++++++ .../minari/offline/iql/relocate/human_v1.yaml | 23 +++++++++++++++++++ 36 files changed, 638 insertions(+) create mode 100644 configs/minari/offline/bc/door/cloned_v1.yaml create mode 100644 configs/minari/offline/bc/door/expert_v1.yaml create mode 100644 configs/minari/offline/bc/door/human_v1.yaml create mode 100644 configs/minari/offline/bc/hammer/cloned_v1.yaml create mode 100644 configs/minari/offline/bc/hammer/expert_v1.yaml create mode 100644 configs/minari/offline/bc/hammer/human_v1.yaml create mode 100644 configs/minari/offline/bc/pen/cloned_v1.yaml create mode 100644 configs/minari/offline/bc/pen/expert_v1.yaml create mode 100644 configs/minari/offline/bc/pen/human_v1.yaml create mode 100644 configs/minari/offline/bc/relocate/cloned_v1.yaml create mode 100644 configs/minari/offline/bc/relocate/expert_v1.yaml create mode 100644 configs/minari/offline/bc/relocate/human_v1.yaml create mode 100644 configs/minari/offline/bc_10/door/cloned_v1.yaml create mode 100644 configs/minari/offline/bc_10/door/expert_v1.yaml create mode 100644 configs/minari/offline/bc_10/door/human_v1.yaml create mode 100644 configs/minari/offline/bc_10/hammer/cloned_v1.yaml create mode 100644 configs/minari/offline/bc_10/hammer/expert_v1.yaml create mode 100644 configs/minari/offline/bc_10/hammer/human_v1.yaml create mode 100644 configs/minari/offline/bc_10/pen/cloned_v1.yaml create mode 100644 configs/minari/offline/bc_10/pen/expert_v1.yaml create mode 100644 configs/minari/offline/bc_10/pen/human_v1.yaml create mode 100644 configs/minari/offline/bc_10/relocate/cloned_v1.yaml create mode 100644 configs/minari/offline/bc_10/relocate/expert_v1.yaml create mode 100644 configs/minari/offline/bc_10/relocate/human_v1.yaml create mode 100644 configs/minari/offline/iql/door/cloned_v1.yaml create mode 100644 configs/minari/offline/iql/door/expert_v1.yaml create mode 100644 configs/minari/offline/iql/door/human_v1.yaml create mode 100644 configs/minari/offline/iql/hammer/cloned_v1.yaml create mode 100644 configs/minari/offline/iql/hammer/expert_v1.yaml create mode 100644 configs/minari/offline/iql/hammer/human_v1.yaml create mode 100644 configs/minari/offline/iql/pen/cloned_v1.yaml create mode 100644 configs/minari/offline/iql/pen/expert_v1.yaml create mode 100644 configs/minari/offline/iql/pen/human_v1.yaml create mode 100644 configs/minari/offline/iql/relocate/cloned_v1.yaml create mode 100644 configs/minari/offline/iql/relocate/expert_v1.yaml create mode 100644 configs/minari/offline/iql/relocate/human_v1.yaml diff --git a/configs/minari/offline/bc/door/cloned_v1.yaml b/configs/minari/offline/bc/door/cloned_v1.yaml new file mode 100644 index 00000000..c1d7118c --- /dev/null +++ b/configs/minari/offline/bc/door/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 
+gamma: 1.0 +group: bc-door-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/door/expert_v1.yaml b/configs/minari/offline/bc/door/expert_v1.yaml new file mode 100644 index 00000000..b4d0d4be --- /dev/null +++ b/configs/minari/offline/bc/door/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-door-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/door/human_v1.yaml b/configs/minari/offline/bc/door/human_v1.yaml new file mode 100644 index 00000000..4b0024a0 --- /dev/null +++ b/configs/minari/offline/bc/door/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-door-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/cloned_v1.yaml b/configs/minari/offline/bc/hammer/cloned_v1.yaml new file mode 100644 index 00000000..67131721 --- /dev/null +++ b/configs/minari/offline/bc/hammer/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/expert_v1.yaml b/configs/minari/offline/bc/hammer/expert_v1.yaml new file mode 100644 index 00000000..ecdbb2c6 --- /dev/null +++ b/configs/minari/offline/bc/hammer/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/hammer/human_v1.yaml b/configs/minari/offline/bc/hammer/human_v1.yaml new file mode 100644 index 00000000..9e170b20 --- /dev/null +++ b/configs/minari/offline/bc/hammer/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-hammer-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/pen/cloned_v1.yaml b/configs/minari/offline/bc/pen/cloned_v1.yaml new file mode 100644 index 00000000..e0365282 --- /dev/null +++ b/configs/minari/offline/bc/pen/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git 
a/configs/minari/offline/bc/pen/expert_v1.yaml b/configs/minari/offline/bc/pen/expert_v1.yaml new file mode 100644 index 00000000..deee7c50 --- /dev/null +++ b/configs/minari/offline/bc/pen/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/pen/human_v1.yaml b/configs/minari/offline/bc/pen/human_v1.yaml new file mode 100644 index 00000000..3e416a58 --- /dev/null +++ b/configs/minari/offline/bc/pen/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-pen-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc/relocate/cloned_v1.yaml b/configs/minari/offline/bc/relocate/cloned_v1.yaml new file mode 100644 index 00000000..e8abc4b7 --- /dev/null +++ b/configs/minari/offline/bc/relocate/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-cloned-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/relocate/expert_v1.yaml b/configs/minari/offline/bc/relocate/expert_v1.yaml new file mode 100644 index 00000000..4566ba23 --- /dev/null +++ b/configs/minari/offline/bc/relocate/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-expert-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc/relocate/human_v1.yaml b/configs/minari/offline/bc/relocate/human_v1.yaml new file mode 100644 index 00000000..083ea2cf --- /dev/null +++ b/configs/minari/offline/bc/relocate/human_v1.yaml @@ -0,0 +1,16 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-relocate-human-v1-multiseed-v0 +name: bc +normalize_state: true +project: CORL +top_fraction: 1.0 +train_seed: 0 +update_steps: 1000000 + diff --git a/configs/minari/offline/bc_10/door/cloned_v1.yaml b/configs/minari/offline/bc_10/door/cloned_v1.yaml new file mode 100644 index 00000000..7fc0ea4e --- /dev/null +++ b/configs/minari/offline/bc_10/door/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/door/expert_v1.yaml b/configs/minari/offline/bc_10/door/expert_v1.yaml new file mode 100644 index 00000000..0c6384b8 --- /dev/null +++ b/configs/minari/offline/bc_10/door/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 
1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/door/human_v1.yaml b/configs/minari/offline/bc_10/door/human_v1.yaml new file mode 100644 index 00000000..8b976bf9 --- /dev/null +++ b/configs/minari/offline/bc_10/door/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-door-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/cloned_v1.yaml b/configs/minari/offline/bc_10/hammer/cloned_v1.yaml new file mode 100644 index 00000000..dfbd9583 --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/expert_v1.yaml b/configs/minari/offline/bc_10/hammer/expert_v1.yaml new file mode 100644 index 00000000..be2c8183 --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/hammer/human_v1.yaml b/configs/minari/offline/bc_10/hammer/human_v1.yaml new file mode 100644 index 00000000..aba9df8f --- /dev/null +++ b/configs/minari/offline/bc_10/hammer/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-hammer-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/pen/cloned_v1.yaml b/configs/minari/offline/bc_10/pen/cloned_v1.yaml new file mode 100644 index 00000000..fd4b66bb --- /dev/null +++ b/configs/minari/offline/bc_10/pen/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-pen-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/pen/expert_v1.yaml b/configs/minari/offline/bc_10/pen/expert_v1.yaml new file mode 100644 index 00000000..f7145cae --- /dev/null +++ b/configs/minari/offline/bc_10/pen/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 
0 +gamma: 1.0 +group: bc-10-pen-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/pen/human_v1.yaml b/configs/minari/offline/bc_10/pen/human_v1.yaml new file mode 100644 index 00000000..ac3e3625 --- /dev/null +++ b/configs/minari/offline/bc_10/pen/human_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-pen-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 \ No newline at end of file diff --git a/configs/minari/offline/bc_10/relocate/cloned_v1.yaml b/configs/minari/offline/bc_10/relocate/cloned_v1.yaml new file mode 100644 index 00000000..55755ab8 --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/cloned_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-cloned-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/relocate/expert_v1.yaml b/configs/minari/offline/bc_10/relocate/expert_v1.yaml new file mode 100644 index 00000000..808885ac --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/expert_v1.yaml @@ -0,0 +1,15 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-expert-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 diff --git a/configs/minari/offline/bc_10/relocate/human_v1.yaml b/configs/minari/offline/bc_10/relocate/human_v1.yaml new file mode 100644 index 00000000..88f9d42b --- /dev/null +++ b/configs/minari/offline/bc_10/relocate/human_v1.yaml @@ -0,0 +1,16 @@ +batch_size: 256 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 1.0 +group: bc-10-relocate-human-v1-multiseed-v0 +name: bc-10 +normalize_state: true +project: CORL +top_fraction: 0.1 +train_seed: 0 +update_steps: 1000000 + diff --git a/configs/minari/offline/iql/door/cloned_v1.yaml b/configs/minari/offline/iql/door/cloned_v1.yaml new file mode 100644 index 00000000..e48f5312 --- /dev/null +++ b/configs/minari/offline/iql/door/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-door-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/door/expert_v1.yaml b/configs/minari/offline/iql/door/expert_v1.yaml new file mode 100644 index 00000000..b63686d3 --- /dev/null +++ b/configs/minari/offline/iql/door/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 
+group: iql-door-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/door/human_v1.yaml b/configs/minari/offline/iql/door/human_v1.yaml new file mode 100644 index 00000000..eb402b94 --- /dev/null +++ b/configs/minari/offline/iql/door/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: door-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-door-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/hammer/cloned_v1.yaml b/configs/minari/offline/iql/hammer/cloned_v1.yaml new file mode 100644 index 00000000..099e8f55 --- /dev/null +++ b/configs/minari/offline/iql/hammer/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/hammer/expert_v1.yaml b/configs/minari/offline/iql/hammer/expert_v1.yaml new file mode 100644 index 00000000..0c2b968e --- /dev/null +++ b/configs/minari/offline/iql/hammer/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/hammer/human_v1.yaml b/configs/minari/offline/iql/hammer/human_v1.yaml new file mode 100644 index 00000000..08883e95 --- /dev/null +++ b/configs/minari/offline/iql/hammer/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: hammer-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-hammer-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/cloned_v1.yaml b/configs/minari/offline/iql/pen/cloned_v1.yaml new file mode 100644 index 00000000..8dc39339 --- /dev/null +++ b/configs/minari/offline/iql/pen/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql 
+normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/expert_v1.yaml b/configs/minari/offline/iql/pen/expert_v1.yaml new file mode 100644 index 00000000..56b1db79 --- /dev/null +++ b/configs/minari/offline/iql/pen/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 \ No newline at end of file diff --git a/configs/minari/offline/iql/pen/human_v1.yaml b/configs/minari/offline/iql/pen/human_v1.yaml new file mode 100644 index 00000000..199e24a3 --- /dev/null +++ b/configs/minari/offline/iql/pen/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: pen-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-pen-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/cloned_v1.yaml b/configs/minari/offline/iql/relocate/cloned_v1.yaml new file mode 100644 index 00000000..1acd1fa6 --- /dev/null +++ b/configs/minari/offline/iql/relocate/cloned_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-cloned-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-cloned-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/expert_v1.yaml b/configs/minari/offline/iql/relocate/expert_v1.yaml new file mode 100644 index 00000000..012815e7 --- /dev/null +++ b/configs/minari/offline/iql/relocate/expert_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-expert-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-expert-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003 diff --git a/configs/minari/offline/iql/relocate/human_v1.yaml b/configs/minari/offline/iql/relocate/human_v1.yaml new file mode 100644 index 00000000..d0416acf --- /dev/null +++ b/configs/minari/offline/iql/relocate/human_v1.yaml @@ -0,0 +1,23 @@ +actor_dropout: 0.1 +actor_lr: 0.0003 +batch_size: 256 +beta: 3.0 +buffer_size: 1000000 +checkpoints_path: null +dataset_id: relocate-human-v1 +eval_episodes: 10 +eval_every: 5000 +eval_seed: 0 +gamma: 0.99 +group: iql-relocate-human-v1-multiseed-v0 +iql_deterministic: false +iql_tau: 0.8 +name: iql +normalize_reward: false +normalize_state: true +project: CORL +qf_lr: 
0.0003 +tau: 0.005 +train_seed: 0 +update_steps: 1000000 +vf_lr: 0.0003

From 88ecaec28d0d361c124661b511fb087a7d75e9e7 Mon Sep 17 00:00:00 2001
From: Howuhh
Date: Wed, 19 Jul 2023 17:49:20 +0300
Subject: [PATCH 11/11] linter fix

---
 algorithms/minari/any_percent_bc.py | 7 +++----
 algorithms/minari/iql.py            | 9 ++++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/algorithms/minari/any_percent_bc.py b/algorithms/minari/any_percent_bc.py
index d7ca44b3..ff3d43c1 100644
--- a/algorithms/minari/any_percent_bc.py
+++ b/algorithms/minari/any_percent_bc.py
@@ -1,9 +1,9 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
 import contextlib
-from dataclasses import asdict, dataclass
 import os
 import random
 import uuid
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gymnasium as gym
 import minari
@@ -12,9 +12,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from tqdm.auto import trange
-
 import wandb
+from tqdm.auto import trange
 
 TensorBatch = List[torch.Tensor]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
diff --git a/algorithms/minari/iql.py b/algorithms/minari/iql.py
index 9902eaa6..4f449622 100644
--- a/algorithms/minari/iql.py
+++ b/algorithms/minari/iql.py
@@ -5,27 +5,26 @@
 # 1. iql_deterministic is true only for 2 datasets. Can we remote it?
 # 2. MLP class introduced bugs in the past. We should remove it.
 # 3. Refactor IQL updating code to be more consistent in style
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import contextlib
 import copy
-from dataclasses import asdict, dataclass
 import os
 import random
 import uuid
+from dataclasses import asdict, dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import gymnasium as gym
 import minari
 import numpy as np
 import pyrallis
 import torch
-from torch.distributions import Normal
 import torch.nn as nn
 import torch.nn.functional as F
+import wandb
+from torch.distributions import Normal
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from tqdm.auto import trange
 
-import wandb
-
 TensorBatch = List[torch.Tensor]
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
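
Note on the evaluation logging in the hunk just before PATCH 10/11: the patch wraps minari.get_normalized_score in
contextlib.suppress(ValueError), i.e. datasets that do not carry reference min/max scores in their metadata are expected
to raise ValueError and the extra metric is simply skipped. A minimal, self-contained sketch of that pattern follows;
the dataset id and the eval_scores array are placeholders for illustration, not values taken from these patches:

    import contextlib

    import minari
    import numpy as np

    dataset = minari.load_dataset("pen-human-v1")     # assumes the dataset was downloaded beforehand
    eval_scores = np.array([1200.0, 950.0, 1430.0])   # stand-in for per-episode returns from evaluation rollouts

    print("evaluation_return:", eval_scores.mean())
    # Normalized score is only defined when reference scores exist in the dataset metadata;
    # if they are missing, get_normalized_score raises ValueError and this block is skipped.
    with contextlib.suppress(ValueError):
        print("normalized_score:", minari.get_normalized_score(dataset, eval_scores).mean() * 100)

Assuming the standard pyrallis CLI is used (not shown in these patches), the configs added in PATCH 10/11 would be passed
as, e.g., python algorithms/minari/iql.py --config_path=configs/minari/offline/iql/pen/human_v1.yaml.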