diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..0481485d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,25 @@ +name: ci +on: + push: + branches: + - main + - howuhh/docs-wip +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v3 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material + - run: mkdocs gh-deploy --force diff --git a/.gitignore b/.gitignore index ce35dd32..03469487 100644 --- a/.gitignore +++ b/.gitignore @@ -145,4 +145,4 @@ dmypy.json .json .yaml wandb -assets/ \ No newline at end of file +#assets/ \ No newline at end of file diff --git a/README.md b/README.md index 2792adfc..36f74e2b 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,25 @@ 🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations of SOTA ORL algorithms. Each implementation is backed by a research-friendly codebase, allowing you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL, check them out too!
* 📜 Single-file implementation -* 📈 Benchmarked Implementation for N algorithms +* 📈 Benchmarked Implementation (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs) * 🖼 [Weights and Biases](https://wandb.ai/site) integration +You can read more about CORL design and main results in our [technical paper](https://arxiv.org/abs/2210.07105). + ---- * ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning Environment. ---- +> ⚠️ **NOTE**: CORL (similarly to CleanRL) is not a modular library and therefore it is not meant to be imported. At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy to understand. You should consider using CORL if you want to 1) understand and control all implementation details of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support. + ## Getting started +Please refer to the [documentation](https://corl-team.github.io/CORL/get-started/install/) for more details. TLDR: + ```bash git clone https://github.com/corl-team/CORL.git && cd CORL pip install -r requirements/requirements_dev.txt @@ -213,7 +222,7 @@ If you use CORL in your work, please use the following bibtex ```bibtex @inproceedings{ tarasov2022corl, - title={{CORL}: Research-oriented Deep Offline Reinforcement Learning Library}, + title={CORL: Research-oriented Deep Offline Reinforcement Learning Library}, author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov}, booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''}, year={2022}, diff --git a/algorithms/offline/any_percent_bc.py b/algorithms/offline/any_percent_bc.py index edacc43e..7b6dfa83 100644 --- a/algorithms/offline/any_percent_bc.py +++ b/algorithms/offline/any_percent_bc.py @@ -19,26 +19,40 @@ @dataclass class TrainConfig: - # Experiment - device: str = "cuda" - env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name - seed: int = 0 # Sets Gym, PyTorch and Numpy seeds - eval_freq: int = int(5e3) # How often (time steps) we evaluate - n_episodes: int = 10 # How many episodes run during evaluation - max_timesteps: int = int(1e6) # Max time steps to run environment - checkpoints_path: Optional[str] = None # Save path - load_model: str = "" # Model load file name, "" doesn't load - batch_size: int = 256 # Batch size for all networks - discount: float = 0.99 # Discount factor - # BC - buffer_size: int = 2_000_000 # Replay buffer size - frac: float = 0.1 # Best data fraction to use - max_traj_len: int = 1000 # Max trajectory length - normalize: bool = True # Normalize states - # Wandb logging + # wandb project name project: str = "CORL" + # wandb group name group: str = "BC-D4RL" + # wandb run name name: str = "BC" + # training dataset and evaluation environment + env: str = "halfcheetah-medium-expert-v2" + # total gradient updates during training + max_timesteps: int = int(1e6) + # training batch size + batch_size: int = 256 + # maximum size of the replay buffer + buffer_size: int = 2_000_000 + # what top fraction of the dataset (sorted by return) to use + frac: float = 0.1 + # maximum possible trajectory length + max_traj_len: int = 1000 + # whether to normalize states + normalize: bool = True + # discount factor + discount: float = 0.99 + #
evaluation frequency, will evaluate eval_freq training steps + eval_freq: int = int(5e3) + # number of episodes to run during evaluation + n_episodes: int = 10 + # path for checkpoints saving, optional + checkpoints_path: Optional[str] = None + # file name for loading a model, optional + load_model: str = "" + # training random seed + seed: int = 0 + # training device + device: str = "cuda" def __post_init__(self): self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" diff --git a/algorithms/offline/awac.py b/algorithms/offline/awac.py index 2c652de9..2fb3bf10 100644 --- a/algorithms/offline/awac.py +++ b/algorithms/offline/awac.py @@ -20,29 +20,49 @@ @dataclass class TrainConfig: + # wandb project name project: str = "CORL" + # wandb group name group: str = "AWAC-D4RL" + # wandb run name name: str = "AWAC" - checkpoints_path: Optional[str] = None - + # training dataset and evaluation environment env_name: str = "halfcheetah-medium-expert-v2" - seed: int = 42 - test_seed: int = 69 - deterministic_torch: bool = False - device: str = "cuda" - - buffer_size: int = 2_000_000 - num_train_ops: int = 1_000_000 - batch_size: int = 256 - eval_frequency: int = 1000 - n_test_episodes: int = 10 - normalize_reward: bool = False - + # actor and critic hidden dim hidden_dim: int = 256 + # actor and critic learning rate learning_rate: float = 3e-4 + # discount factor gamma: float = 0.99 + # coefficient for the target critic Polyak's update tau: float = 5e-3 + # awac actor loss temperature, controlling balance + # between behaviour cloning and Q-value maximization awac_lambda: float = 1.0 + # total number of gradient updated during training + num_train_ops: int = 1_000_000 + # training batch size + batch_size: int = 256 + # maximum size of the replay buffer + buffer_size: int = 2_000_000 + # whether to normalize reward (like in IQL) + normalize_reward: bool = False + # evaluation frequency, will evaluate every eval_frequency + # training steps + eval_frequency: int = 1000 + # number of episodes to run during evaluation + n_test_episodes: int = 10 + # path for checkpoints saving, optional + checkpoints_path: Optional[str] = None + # configure PyTorch to use deterministic algorithms instead + # of nondeterministic ones + deterministic_torch: bool = False + # training random seed + seed: int = 42 + # evaluation random seed + test_seed: int = 69 + # training device + device: str = "cuda" def __post_init__(self): self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}" diff --git a/algorithms/offline/cql.py b/algorithms/offline/cql.py index a1470eb0..307ffc08 100644 --- a/algorithms/offline/cql.py +++ b/algorithms/offline/cql.py @@ -23,7 +23,6 @@ @dataclass class TrainConfig: - # Experiment device: str = "cuda" env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name seed: int = 0 # Sets Gym, PyTorch and Numpy seeds @@ -32,8 +31,6 @@ class TrainConfig: max_timesteps: int = int(1e6) # Max time steps to run environment checkpoints_path: Optional[str] = None # Save path load_model: str = "" # Model load file name, "" doesn't load - - # CQL buffer_size: int = 2_000_000 # Replay buffer size batch_size: int = 256 # Batch size for all networks discount: float = 0.99 # Discount factor @@ -59,9 +56,7 @@ class TrainConfig: q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks reward_scale: float = 1.0 # Reward scale for normalization reward_bias: float = 0.0 # Reward bias for normalization - - # AntMaze hacks - bc_steps: int = int(0) # Number of BC steps at start + bc_steps: 
int = int(0) # Number of BC steps at start (AntMaze hacks) reward_scale: float = 5.0 reward_bias: float = -1.0 policy_log_std_multiplier: float = 1.0 diff --git a/algorithms/offline/dt.py b/algorithms/offline/dt.py index 37c61e67..367c337e 100644 --- a/algorithms/offline/dt.py +++ b/algorithms/offline/dt.py @@ -1,5 +1,5 @@ # inspiration: -# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa +# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # 2. https://github.com/karpathy/minGPT import os import random @@ -17,44 +17,70 @@ import wandb from torch.nn import functional as F from torch.utils.data import DataLoader, IterableDataset -from tqdm.auto import tqdm, trange # noqa +from tqdm.auto import trange @dataclass class TrainConfig: - # wandb params + # wandb project name project: str = "CORL" + # wandb group name group: str = "DT-D4RL" + # wandb run name name: str = "DT" - # model params + # transformer hidden dim embedding_dim: int = 128 + # depth of the transformer model num_layers: int = 3 + # number of heads in the attention num_heads: int = 1 + # maximum sequence length during training seq_len: int = 20 + # maximum rollout length, needed for the positional embeddings episode_len: int = 1000 + # attention dropout attention_dropout: float = 0.1 + # residual dropout residual_dropout: float = 0.1 + # embeddings dropout embedding_dropout: float = 0.1 + # maximum range for the symmetric actions, [-1, 1] max_action: float = 1.0 - # training params + # training dataset and evaluation environment env_name: str = "halfcheetah-medium-v2" + # AdamW optimizer learning rate learning_rate: float = 1e-4 + # AdamW optimizer betas betas: Tuple[float, float] = (0.9, 0.999) + # AdamW weight decay weight_decay: float = 1e-4 + # maximum gradient norm during training, optional clip_grad: Optional[float] = 0.25 + # training batch size batch_size: int = 64 + # total training steps update_steps: int = 100_000 + # warmup steps for the learning rate scheduler warmup_steps: int = 10_000 + # reward scaling, to reduce the magnitude reward_scale: float = 0.001 + # number of workers for the pytorch dataloader num_workers: int = 4 - # evaluation params + # target return-to-go for the prompting during evaluation target_returns: Tuple[float, ...]
= (12000.0, 6000.0) + # number of episodes to run during evaluation eval_episodes: int = 100 + # evaluation frequency, will evaluate eval_every training steps eval_every: int = 10_000 - # general params + # path for checkpoints saving, optional checkpoints_path: Optional[str] = None + # configure PyTorch to use deterministic algorithms instead + # of nondeterministic ones deterministic_torch: bool = False + # training random seed train_seed: int = 10 + # evaluation random seed eval_seed: int = 42 + # training device device: str = "cuda" def __post_init__(self): @@ -180,7 +206,7 @@ def __prepare_sample(self, traj_idx, start_idx): states = (states - self.state_mean) / self.state_std returns = returns * self.reward_scale - # pad up to seq_len if needed + # pad up to seq_len if needed, padding is masked during training mask = np.hstack( [np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])] ) diff --git a/algorithms/offline/edac.py b/algorithms/offline/edac.py index 413801c9..b668e43f 100644 --- a/algorithms/offline/edac.py +++ b/algorithms/offline/edac.py @@ -21,36 +21,58 @@ @dataclass class TrainConfig: - # wandb params + # wandb project name project: str = "CORL" + # wandb group name group: str = "EDAC-D4RL" + # wandb run name name: str = "EDAC" - # model params + # actor and critic hidden dim hidden_dim: int = 256 + # critic ensemble size num_critics: int = 10 + # discount factor gamma: float = 0.99 + # coefficient for the target critic Polyak's update tau: float = 5e-3 + # coefficient for the ensemble diversification loss eta: float = 1.0 + # actor learning rate actor_learning_rate: float = 3e-4 + # critic learning rate critic_learning_rate: float = 3e-4 + # alpha learning rate alpha_learning_rate: float = 3e-4 + # maximum range for the symmetric actions, [-1, 1] max_action: float = 1.0 - # training params + # maximum size of the replay buffer buffer_size: int = 1_000_000 + # training dataset and evaluation environment env_name: str = "halfcheetah-medium-v2" + # training batch size batch_size: int = 256 + # total number of training epochs num_epochs: int = 3000 + # number of gradient updates during one epoch num_updates_on_epoch: int = 1000 + # whether to normalize reward (like in IQL) normalize_reward: bool = False - # evaluation params + # number of episodes to run during evaluation eval_episodes: int = 10 + # evaluation frequency, will evaluate eval_every training steps eval_every: int = 5 - # general params + # path for checkpoints saving, optional checkpoints_path: Optional[str] = None + # configure PyTorch to use deterministic algorithms instead + # of nondeterministic ones deterministic_torch: bool = False + # training random seed train_seed: int = 10 + # evaluation random seed eval_seed: int = 42 + # frequency of metrics logging to the wandb log_every: int = 100 + # training device device: str = "cpu" def __post_init__(self): diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py index 72c28726..14cec70a 100644 --- a/algorithms/offline/iql.py +++ b/algorithms/offline/iql.py @@ -29,33 +29,55 @@ @dataclass class TrainConfig: - # Experiment - device: str = "cuda" - env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name - seed: int = 0 # Sets Gym, PyTorch and Numpy seeds - eval_freq: int = int(5e3) # How often (time steps) we evaluate - n_episodes: int = 10 # How many episodes run during evaluation - max_timesteps: int = int(1e6) # Max time steps to run environment - checkpoints_path: Optional[str] = None # Save path - load_model: str = "" # 
Model load file name, "" doesn't load - # IQL - buffer_size: int = 2_000_000 # Replay buffer size - batch_size: int = 256 # Batch size for all networks - discount: float = 0.99 # Discount factor - tau: float = 0.005 # Target network update rate - beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q - iql_tau: float = 0.7 # Coefficient for asymmetric loss - iql_deterministic: bool = False # Use deterministic actor - normalize: bool = True # Normalize states - normalize_reward: bool = False # Normalize reward - vf_lr: float = 3e-4 # V function learning rate - qf_lr: float = 3e-4 # Critic learning rate - actor_lr: float = 3e-4 # Actor learning rate - actor_dropout: Optional[float] = None # Adroit uses dropout for policy network - # Wandb logging + # wandb project name project: str = "CORL" + # wandb group name group: str = "IQL-D4RL" + # wandb run name name: str = "IQL" + # training dataset and evaluation environment + env: str = "halfcheetah-medium-expert-v2" + # discount factor + discount: float = 0.99 + # coefficient for the target critic Polyak's update + tau: float = 0.005 + # actor update inverse temperature, similar to AWAC + # small beta -> BC, big beta -> maximizing Q-value + beta: float = 3.0 + # coefficient for asymmetric critic loss + iql_tau: float = 0.7 + # whether to use deterministic actor + iql_deterministic: bool = False + # total gradient updates during training + max_timesteps: int = int(1e6) + # maximum size of the replay buffer + buffer_size: int = 2_000_000 + # training batch size + batch_size: int = 256 + # whether to normalize states + normalize: bool = True + # whether to normalize reward (like in IQL) + normalize_reward: bool = False + # V-critic function learning rate + vf_lr: float = 3e-4 + # Q-critic learning rate + qf_lr: float = 3e-4 + # actor learning rate + actor_lr: float = 3e-4 + # whether to use dropout for the policy network, optional + actor_dropout: Optional[float] = None + # evaluation frequency, will evaluate every eval_freq training steps + eval_freq: int = int(5e3) + # number of episodes to run during evaluation + n_episodes: int = 10 + # path for checkpoints saving, optional + checkpoints_path: Optional[str] = None + # file name for loading a model, optional + load_model: str = "" + # training random seed + seed: int = 0 + # training device + device: str = "cuda" def __post_init__(self): self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" diff --git a/algorithms/offline/lb_sac.py b/algorithms/offline/lb_sac.py index 71fb8c54..4f10f77e 100644 --- a/algorithms/offline/lb_sac.py +++ b/algorithms/offline/lb_sac.py @@ -23,36 +23,58 @@ # base learning rate: 3e-4 @dataclass class TrainConfig: - # wandb params + # wandb project name project: str = "CORL" + # wandb group name group: str = "LB-SAC" + # wandb run name name: str = "LB-SAC" - # model params + # actor and critic hidden dim hidden_dim: int = 256 + # critic ensemble size num_critics: int = 10 + # discount factor gamma: float = 0.99 + # coefficient for the target critic Polyak's update tau: float = 5e-3 + # actor learning rate (before scaling was 3e-4) actor_learning_rate: float = 0.0018 + # critic learning rate (before scaling was 3e-4) critic_learning_rate: float = 0.0018 + # alpha learning rate (before scaling was 3e-4) alpha_learning_rate: float = 0.0018 + # whether to use layer normalization for critic critic_layernorm: bool = False + # whether to use initialization from EDAC paper edac_init: bool = False + # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0 - # training params + # maximum size of the replay buffer buffer_size: int = 1_000_000 + # training dataset and evaluation environment env_name: str = "halfcheetah-medium-v2" + # training batch size batch_size: int = 10_000 + # total number of training epochs num_epochs: int = 300 + # number of gradient updates during one epoch num_updates_on_epoch: int = 1000 - # evaluation params + # number of episodes to run during evaluation eval_episodes: int = 10 + # evaluation frequency, will evaluate eval_every training steps eval_every: int = 5 - # general params + # path for checkpoints saving, optional checkpoints_path: Optional[str] = None + # configure PyTorch to use deterministic algorithms instead + # of nondeterministic ones deterministic_torch: bool = False + # training random seed train_seed: int = 10 + # evaluation random seed eval_seed: int = 42 + # frequency of metrics logging to the wandb log_every: int = 100 + # training device device: str = "cpu" def __post_init__(self): diff --git a/algorithms/offline/sac_n.py b/algorithms/offline/sac_n.py index 0b91ddec..a44da091 100644 --- a/algorithms/offline/sac_n.py +++ b/algorithms/offline/sac_n.py @@ -21,36 +21,57 @@ @dataclass class TrainConfig: - # wandb params + # wandb project name project: str = "CORL" + # wandb group name group: str = "SAC-N" + # wandb run name name: str = "SAC-N" - # model params + # actor and critic hidden dim hidden_dim: int = 256 + # critic ensemble size num_critics: int = 10 + # discount factor gamma: float = 0.99 + # coefficient for the target critic Polyak's update tau: float = 5e-3 + # actor learning rate actor_learning_rate: float = 3e-4 + # critic learning rate critic_learning_rate: float = 3e-4 + # entropy coefficient learning rate for automatic tuning alpha_learning_rate: float = 3e-4 + # maximum range for the symmetric actions, [-1, 1] max_action: float = 1.0 - # training params + # maximum size of the replay buffer buffer_size: int = 1_000_000 + # training dataset and evaluation environment env_name: str = "halfcheetah-medium-v2" + # training batch size batch_size: int = 256 + # total number of training epochs num_epochs: int = 3000 + # number of gradient updates during one epoch num_updates_on_epoch: int = 1000 + # whether to normalize reward (like in IQL) normalize_reward: bool = False - # evaluation params + # number of episodes to run during evaluation eval_episodes: int = 10 + # evaluation frequency, will evaluate eval_every training steps eval_every: int = 5 - # general params + # path for checkpoints saving, optional checkpoints_path: Optional[str] = None + # configure PyTorch to use deterministic algorithms instead + # of nondeterministic ones deterministic_torch: bool = False + # training random seed train_seed: int = 10 + # evaluation random seed eval_seed: int = 42 + # frequency of metrics logging to the wandb log_every: int = 100 - device: str = "cpu" + # training device + device: str = "cuda" def __post_init__(self): self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}" @@ -465,6 +486,8 @@ def eval_actor( return np.array(episode_rewards) +# normalization like in the IQL paper +# https://github.com/ikostrikov/implicit_q_learning/blob/09d700248117881a75cb21f0adb95c6c8a694cb2/train_offline.py#L35 # noqa def return_reward_range(dataset, max_episode_steps): returns, lengths = [], [] ep_ret, ep_len = 0.0, 0 diff --git a/algorithms/offline/td3_bc.py b/algorithms/offline/td3_bc.py index a78bda30..6ae0379c 100644 --- a/algorithms/offline/td3_bc.py +++ 
b/algorithms/offline/td3_bc.py @@ -22,32 +22,51 @@ @dataclass class TrainConfig: - # Experiment - device: str = "cuda" - env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name - seed: int = 0 # Sets Gym, PyTorch and Numpy seeds - eval_freq: int = int(5e3) # How often (time steps) we evaluate - n_episodes: int = 10 # How many episodes run during evaluation - max_timesteps: int = int(1e6) # Max time steps to run environment - checkpoints_path: Optional[str] = None # Save path - load_model: str = "" # Model load file name, "" doesn't load - # TD3 - buffer_size: int = 2_000_000 # Replay buffer size - batch_size: int = 256 # Batch size for all networks - discount: float = 0.99 # Discount ffor - expl_noise: float = 0.1 # Std of Gaussian exploration noise - tau: float = 0.005 # Target network update rate - policy_noise: float = 0.2 # Noise added to target actor during critic update - noise_clip: float = 0.5 # Range to clip target actor noise - policy_freq: int = 2 # Frequency of delayed actor updates - # TD3 + BC - alpha: float = 2.5 # Coefficient for Q function in actor loss - normalize: bool = True # Normalize states - normalize_reward: bool = False # Normalize reward - # Wandb logging + # wandb project name project: str = "CORL" + # wandb group name group: str = "TD3_BC-D4RL" + # wandb run name name: str = "TD3_BC" + # training dataset and evaluation environment + env: str = "halfcheetah-medium-expert-v2" + # coefficient for the Q-function in actor loss + alpha: float = 2.5 + # discount factor + discount: float = 0.99 + # standard deviation for the Gaussian exploration noise + expl_noise: float = 0.1 + # coefficient for the target critic Polyak's update + tau: float = 0.005 + # scaling coefficient for the noise added to + # target actor during critic update + policy_noise: float = 0.2 + # range for the target actor noise clipping + noise_clip: float = 0.5 + # actor update delay + policy_freq: int = 2 + # total gradient updates during training + max_timesteps: int = int(1e6) + # maximum size of the replay buffer + buffer_size: int = 2_000_000 + # training batch size + batch_size: int = 256 + # whether to normalize states + normalize: bool = True + # whether to normalize reward (like in IQL) + normalize_reward: bool = False + # evaluation frequency, will evaluate every eval_freq training steps + eval_freq: int = int(5e3) + # number of episodes to run during evaluation + n_episodes: int = 10 + # path for checkpoints saving, optional + checkpoints_path: Optional[str] = None + # file name for loading a model, optional + load_model: str = "" + # training random seed + seed: int = 0 + # training device + device: str = "cuda" def __post_init__(self): self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" diff --git a/docs/algorithms/awac.md b/docs/algorithms/awac.md new file mode 100644 index 00000000..94731ee0 --- /dev/null +++ b/docs/algorithms/awac.md @@ -0,0 +1 @@ +# AWAC diff --git a/docs/algorithms/bc.md b/docs/algorithms/bc.md new file mode 100644 index 00000000..7f00d30c --- /dev/null +++ b/docs/algorithms/bc.md @@ -0,0 +1 @@ +# BC \ No newline at end of file diff --git a/docs/algorithms/cal-ql.md b/docs/algorithms/cal-ql.md new file mode 100644 index 00000000..798de1ef --- /dev/null +++ b/docs/algorithms/cal-ql.md @@ -0,0 +1 @@ +# Cal-QL diff --git a/docs/algorithms/cql.md b/docs/algorithms/cql.md new file mode 100644 index 00000000..194cc7a1 --- /dev/null +++ b/docs/algorithms/cql.md @@ -0,0 +1 @@ +# CQL \ No newline at end of file diff --git a/docs/algorithms/dt.md
b/docs/algorithms/dt.md new file mode 100644 index 00000000..e3218c9d --- /dev/null +++ b/docs/algorithms/dt.md @@ -0,0 +1,130 @@ +--- +hide: + - toc # Hide table of contents +--- + +# DT + +## Overview + +The Decision Transformer (DT) model casts offline reinforcement learning as a conditional sequence modeling problem. + +Unlike prior approaches to offline RL that fit value functions or compute policy gradients, Decision Transformer simply outputs the optimal +actions by leveraging a causally masked Transformer. By conditioning an autoregressive model on the desired return +(reward-to-go), past states, and actions, the Decision Transformer model can generate future actions that achieve the desired return. + +Original paper: + + * [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) + * [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) + (similar approach, came out at the same time) + +Reference resources: + +* :material-github: [Official codebase for Decision Transformer](https://github.com/kzl/decision-transformer) + +!!! success + Due to the simple supervised objective and transformer architecture, Decision Transformer is simple, stable and easy to implement as it + has a minimal number of moving parts. + +!!! warning + Despite its simplicity and stability, DT has a number of drawbacks. It is not capable of stitching suboptimal + trajectories (which is why it performs poorly on the AntMaze datasets), and can also [show](https://arxiv.org/abs/2205.15967) poor performance in stochastic environments. + +Possible extensions: + +* [Online Decision Transformer](https://arxiv.org/abs/2202.05607) +* [Emergent Agentic Transformer from Chain of Hindsight Experience](https://arxiv.org/abs/2305.16554) +* [Q-learning Decision Transformer: Leveraging Dynamic Programming for Conditional Sequence Modelling in Offline RL](https://proceedings.mlr.press/v202/yamagata23a.html) + +We'd be glad if someone is interested in contributing them! + +## Implemented Variants + +| Variants Implemented | Description | |------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------| | :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py)
:material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/dt) | For continuous action spaces and offline RL without fine-tuning support. | + + +## Explanation of logged metrics + +* `eval/{target_return}_return_mean`: mean undiscounted evaluation return when prompted with `config.target_return` value (there might be more than one) +* `eval/{target_return}_return_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes +* `eval/{target_return}_normalized_score_mean`: mean normalized score when prompted with `config.target_return` value (there might be more than one). + Should be between 0 and 100, where 100+ is the performance above expert for this environment. + Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)]. +* `eval/{target_return}_normalized_score_std`: standard deviation of the normalized score return across `config.eval_episodes` episodes +* `train_loss`: current training loss, Mean squared error (MSE) for continuous action spaces +* `learning_rate`: current learning rate, helps monitor learning rate schedule + +## Implementation details + +1. Batch sampling weighted by trajectory length (:material-github: [algorithms/offline/dt.py#L171](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L171)) +2. State normalization during training and inference (:material-github: [algorithms/offline/dt.py#L181](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L181)) +3. Reward downscaling (:material-github: [algorithms/offline/dt.py#L182](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L182)) +4. Positional embedding shared across one transition (:material-github: [algorithms/offline/dt.py#L323](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L323)) +5. Prompting with multiple return-to-go's during evaluation, as DT can be sensitive to the prompt (:material-github: [algorithms/offline/dt.py#L498](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L498)) + +## Experimental results + +For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md). +Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly. 
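To make the implementation details above more concrete, here is a rough, standalone sketch of how a single DT training sample can be formed (return-to-go targets, state normalization, reward downscaling, and padding with a mask). It is an illustration under simplified assumptions (plain NumPy arrays, actions omitted), not the exact `__prepare_sample` from `dt.py`:

```python
import numpy as np


def prepare_dt_sample(states, rewards, start, seq_len=20,
                      state_mean=0.0, state_std=1.0, reward_scale=0.001):
    # Illustrative sketch only (actions omitted); it mirrors the general recipe,
    # not the exact implementation in algorithms/offline/dt.py.
    rtg = np.cumsum(rewards[::-1])[::-1]   # return-to-go: sum of rewards from t onward
    s = states[start:start + seq_len]
    r = rtg[start:start + seq_len, None]

    s = (s - state_mean) / state_std       # state normalization
    r = r * reward_scale                   # reward downscaling

    pad = seq_len - s.shape[0]             # pad up to seq_len if needed
    mask = np.hstack([np.ones(s.shape[0]), np.zeros(pad)])
    s = np.pad(s, ((0, pad), (0, 0)))
    r = np.pad(r, ((0, pad), (0, 0)))
    return s, r, mask                      # padded steps are masked out of the loss
```

For example, with `seq_len=20` and only 15 steps left in the trajectory, the last 5 positions are zero-padded and excluded from the training loss via `mask`.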
+ + + +## Training options + +```commandline +usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int] + [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float] + [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float] + [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int] + [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]] + [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool] + [--train_seed int] [--eval_seed int] [--device str] + +optional arguments: + -h, --help show this help message and exit + --config_path str Path for a config file to parse with pyrallis (default: None) + +TrainConfig: + + --project str wandb project name (default: CORL) + --group str wandb group name (default: DT-D4RL) + --name str wandb run name (default: DT) + --embedding_dim int transformer hidden dim (default: 128) + --num_layers int depth of the transformer model (default: 3) + --num_heads int number of heads in the attention (default: 1) + --seq_len int maximum sequence length during training (default: 20) + --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000) + --attention_dropout float + attention dropout (default: 0.1) + --residual_dropout float + residual dropout (default: 0.1) + --embedding_dropout float + embeddings dropout (default: 0.1) + --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0) + --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2) + --learning_rate float + AdamW optimizer learning rate (default: 0.0001) + --betas float float AdamW optimizer betas (default: (0.9, 0.999)) + --weight_decay float AdamW weight decay (default: 0.0001) + --clip_grad [float] maximum gradient norm during training, optional (default: 0.25) + --batch_size int training batch size (default: 64) + --update_steps int total training steps (default: 100000) + --warmup_steps int warmup steps for the learning rate scheduler (default: 10000) + --reward_scale float reward scaling, to reduce the magnitude (default: 0.001) + --num_workers int number of workers for the pytorch dataloader (default: 4) + --target_returns float [float, ...] 
+ target return-to-go for the prompting during evaluation (default: (12000.0, 6000.0)) + --eval_episodes int number of episodes to run during evaluation (default: 100) + --eval_every int evaluation frequency, will evaluate eval_every training steps (default: 10000) + --checkpoints_path [str] + path for checkpoints saving, optional (default: None) + --deterministic_torch bool + configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False) + --train_seed int training random seed (default: 10) + --eval_seed int evaluation random seed (default: 42) + --device str training device (default: cuda) +``` + diff --git a/docs/algorithms/edac.md b/docs/algorithms/edac.md new file mode 100644 index 00000000..68c4a050 --- /dev/null +++ b/docs/algorithms/edac.md @@ -0,0 +1 @@ +# EDAC diff --git a/docs/algorithms/iql.md b/docs/algorithms/iql.md new file mode 100644 index 00000000..d619a8c4 --- /dev/null +++ b/docs/algorithms/iql.md @@ -0,0 +1 @@ +# IQL \ No newline at end of file diff --git a/docs/algorithms/lb-sac.md b/docs/algorithms/lb-sac.md new file mode 100644 index 00000000..4498896c --- /dev/null +++ b/docs/algorithms/lb-sac.md @@ -0,0 +1 @@ +# LB-SAC diff --git a/docs/algorithms/rebrac.md b/docs/algorithms/rebrac.md new file mode 100644 index 00000000..6978645c --- /dev/null +++ b/docs/algorithms/rebrac.md @@ -0,0 +1 @@ +# ReBRAC \ No newline at end of file diff --git a/docs/algorithms/sac-n.md b/docs/algorithms/sac-n.md new file mode 100644 index 00000000..8c05636a --- /dev/null +++ b/docs/algorithms/sac-n.md @@ -0,0 +1,153 @@ +--- +hide: + - toc # Hide table of contents +--- + +# SAC-N + +## Overview + +SAC-N is a simple extension of the well-known online Soft Actor-Critic (SAC) algorithm. For an overview of online SAC, +see the excellent [documentation at **CleanRL**](https://docs.cleanrl.dev/rl-algorithms/sac/). SAC utilizes a conventional +technique from online RL, Clipped Double Q-learning, which uses the minimum value of two parallel Q-networks +as the Bellman target. SAC-N modifies SAC by increasing the size of the Q-ensemble from $2$ to $N$ to prevent overestimation. +That's it! + + +Critic loss (change in blue): + +$$ +\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right] +$$ + +Actor loss (change in blue): + +$$ +\max _\theta \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \pi_\theta(\cdot \mid \mathbf{s})}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j}(\mathbf{s}, \mathbf{a})-\alpha \log \pi_\theta(\mathbf{a} \mid \mathbf{s})\right] +$$ + +Why does it work? There is a simple intuition given in the original paper. The clipped Q-learning algorithm, which chooses the +worst-case Q-value to compute the pessimistic estimate, can also be interpreted as utilizing the LCB of the Q-value +predictions. Suppose $Q(s, a)$ follows a Gaussian distribution with mean $m(s, a)$ and standard deviation $\sigma(s, a)$. Also, +let $\left\{Q_j(\mathbf{s}, \mathbf{a})\right\}_{j=1}^N$ be realizations of $Q(s, a)$.
Then, we can approximate the expected minimum of the realizations as + +$$ +\mathbb{E}\left[\min _{j=1, \ldots, N} Q_j(\mathbf{s}, \mathbf{a})\right] \approx m(\mathbf{s}, \mathbf{a})-\Phi^{-1}\left(\frac{N-\frac{\pi}{8}}{N-\frac{\pi}{4}+1}\right) \sigma(\mathbf{s}, \mathbf{a}) +$$ + +where $\Phi$ is the CDF of the standard Gaussian distribution. This relation indicates that using the clipped Q-value +is similar to penalizing the ensemble mean of the Q-values with the standard deviation scaled by a coefficient dependent on $N$. +For OOD actions, the standard deviation will be higher, and thus the penalty will be stronger, preventing divergence. + +Original paper: + +* [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble](https://arxiv.org/abs/2110.01548) + +Reference resources: + +* :material-github: [Official codebase for SAC-N and EDAC](https://github.com/snu-mllab/EDAC) + + +!!! success + SAC-N is an extremely simple extension of online SAC and works quite well out of the box on the majority of benchmarks. + Usually only one parameter needs tuning: the size of the critic ensemble. It has SOTA results on the D4RL-Mujoco domain. + +!!! warning + Typically, SAC-N requires more time to converge, 3M updates instead of the usual 1M. Also, more complex tasks + may require a larger ensemble size, which will considerably increase training time. Finally, + SAC-N mysteriously does not work on the AntMaze domain. If you know how to fix this, let us know, it would be awesome! + + +Possible extensions: + +* [Anti-Exploration by Random Network Distillation](https://arxiv.org/abs/2301.13616) +* [Why So Pessimistic? Estimating Uncertainties for Offline RL through Ensembles, and Why Their Independence Matters](https://arxiv.org/abs/2205.13703) + +We'd be glad if someone is interested in contributing them! + +## Implemented Variants + +| Variants Implemented | Description | |------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------| | :material-github:[`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py)
:material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/sac_n) | For continuous action spaces and offline RL without fine-tuning support. | + + +## Explanation of logged metrics + +* `critic_loss`: sum of the Q-ensemble individual mean losses (for loss definition see above) +* `actor_loss`: mean actor loss (for loss definition see above) +* `alpha_loss`: entropy regularization coefficient loss for automatic policy entropy tuning (see **CleanRL** docs for more details) +* `batch_entropy`: estimation of the policy distribution entropy based on the batch states +* `alpha`: coefficient for entropy regularization of the policy +* `q_policy_std`: standard deviation of the Q-ensemble on a batch of states and policy actions +* `q_random_std`: standard deviation of the Q-ensemble on a batch of states and random (OOD) actions +* `eval/reward_mean`: mean undiscounted evaluation return +* `eval/reward_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes +* `eval/normalized_score_mean`: mean evaluation normalized score. Should be between 0 and 100, where 100+ is the + performance above expert for this environment. Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)]. +* `eval/normalized_score_std`: standard deviation of the evaluation normalized score across `config.eval_episodes` episodes + +## Implementation details + +1. Efficient ensemble implementation with vectorized linear layers (:material-github:[algorithms/offline/sac_n.py#L174](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L174)) +2. Actor last layer initialization with small values (:material-github:[algorithms/offline/sac_n.py#L223](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L223)) +3. Critic last layer initialization with small values (but bigger than in actor) (:material-github:[algorithms/offline/sac_n.py#L283](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L283)) +4. Clipping bounds for actor `log_std` are different from the original online SAC (:material-github:[algorithms/offline/sac_n.py#L241](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L241)) + +## Experimental results + +For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md). +Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly.
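As a quick numerical illustration of the LCB view above, the sketch below samples Gaussian Q-ensembles, takes the minimum over critics (as the SAC-N target does), and compares the empirical mean of that minimum against the $\Phi^{-1}$ penalty formula. It is a standalone toy check with made-up numbers, not code from `sac_n.py`:

```python
import numpy as np
from statistics import NormalDist


def lcb_coefficient(n_critics: int) -> float:
    # Phi^{-1}((N - pi/8) / (N - pi/4 + 1)) from the approximation above
    return NormalDist().inv_cdf((n_critics - np.pi / 8) / (n_critics - np.pi / 4 + 1))


def empirical_min_q(mean, std, n_critics, n_samples=100_000, seed=0):
    # average of min_j Q_j over many sampled ensembles of i.i.d. Gaussian Q-values
    q = np.random.default_rng(seed).normal(mean, std, size=(n_samples, n_critics))
    return q.min(axis=1).mean()


if __name__ == "__main__":
    m, sigma, n = 100.0, 10.0, 10          # toy Q-value mean/std and ensemble size
    print("formula:  ", m - lcb_coefficient(n) * sigma)   # roughly 84.4
    print("empirical:", empirical_min_q(m, sigma, n))     # roughly 84.6
```

The small gap between the two numbers is the approximation error of the order-statistic formula; the qualitative point is that the ensemble minimum behaves like the mean minus a standard-deviation penalty that grows with both $N$ and $\sigma$, and $\sigma$ is larger for OOD actions.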
+ + + +## Training options + +```commandline +usage: sac_n.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--hidden_dim int] [--num_critics int] + [--gamma float] [--tau float] [--actor_learning_rate float] [--critic_learning_rate float] + [--alpha_learning_rate float] [--max_action float] [--buffer_size int] [--env_name str] [--batch_size int] + [--num_epochs int] [--num_updates_on_epoch int] [--normalize_reward bool] [--eval_episodes int] + [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool] [--train_seed int] + [--eval_seed int] [--log_every int] [--device str] + +optional arguments: + -h, --help show this help message and exit + --config_path str Path for a config file to parse with pyrallis (default: None) + +TrainConfig: + + --project str wandb project name (default: CORL) + --group str wandb group name (default: SAC-N) + --name str wandb run name (default: SAC-N) + --hidden_dim int actor and critic hidden dim (default: 256) + --num_critics int critic ensemble size (default: 10) + --gamma float discount factor (default: 0.99) + --tau float coefficient for the target critic Polyak's update (default: 0.005) + --actor_learning_rate float + actor learning rate (default: 0.0003) + --critic_learning_rate float + critic learning rate (default: 0.0003) + --alpha_learning_rate float + entropy coefficient learning rate for automatic tuning (default: 0.0003) + --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0) + --buffer_size int maximum size of the replay buffer (default: 1000000) + --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2) + --batch_size int training batch size (default: 256) + --num_epochs int total number of training epochs (default: 3000) + --num_updates_on_epoch int + number of gradient updates during one epoch (default: 1000) + --normalize_reward bool + whether to normalize reward (like in IQL) (default: False) + --eval_episodes int number of episodes to run during evaluation (default: 10) + --eval_every int evaluation frequency, will evaluate eval_every training steps (default: 5) + --checkpoints_path [str] + path for checkpoints saving, optional (default: None) + --deterministic_torch bool + configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False) + --train_seed int training random seed (default: 10) + --eval_seed int evaluation random seed (default: 42) + --log_every int frequency of metrics logging to the wandb (default: 100) + --device str training device (default: cpu) +``` + diff --git a/docs/algorithms/spot.md b/docs/algorithms/spot.md new file mode 100644 index 00000000..bc991a94 --- /dev/null +++ b/docs/algorithms/spot.md @@ -0,0 +1 @@ +# SPOT \ No newline at end of file diff --git a/docs/algorithms/td3-bc.md b/docs/algorithms/td3-bc.md new file mode 100644 index 00000000..8199ab8f --- /dev/null +++ b/docs/algorithms/td3-bc.md @@ -0,0 +1 @@ +# TD3+BC \ No newline at end of file diff --git a/docs/assets/corl.pdf b/docs/assets/corl.pdf new file mode 100644 index 00000000..d4eb872c Binary files /dev/null and b/docs/assets/corl.pdf differ diff --git a/docs/assets/logo.jpeg b/docs/assets/logo.jpeg new file mode 100644 index 00000000..6fc0a734 Binary files /dev/null and b/docs/assets/logo.jpeg differ diff --git a/docs/assets/perf_profiles_offline.pdf b/docs/assets/perf_profiles_offline.pdf new file mode 100644 index 00000000..33c2cb0e Binary files /dev/null and b/docs/assets/perf_profiles_offline.pdf differ diff 
--git a/docs/assets/perf_profiles_online.pdf b/docs/assets/perf_profiles_online.pdf new file mode 100644 index 00000000..c59684ac Binary files /dev/null and b/docs/assets/perf_profiles_online.pdf differ diff --git a/docs/benchmarks/offline-to-online.md b/docs/benchmarks/offline-to-online.md new file mode 100644 index 00000000..32f9e4d4 --- /dev/null +++ b/docs/benchmarks/offline-to-online.md @@ -0,0 +1,60 @@ +--- +hide: + - toc # Hide table of contents +--- + +# Offline-to-online + + Here, we report reproduced scores after offline pretraining and online fine-tuning for all datasets and offline-to-online algorithms considered. + +!!! tip + If you want to re-collect our results in a more structured/nuanced manner, see [how to reproduce](repro.md) section. + +## Scores + +### Antmaze +| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| +|--------------------------|------------|--------|--------|-----|-----| +|antmaze-umaze-v2|52.75 ± 8.67 → 98.75 ± 1.09|94.00 ± 1.58 → 99.50 ± 0.87|77.00 ± 0.71 → 96.50 ± 1.12|91.00 ± 2.55 → 99.50 ± 0.50|76.75 ± 7.53 → 99.75 ± 0.43| +|antmaze-umaze-diverse-v2|56.00 ± 2.74 → 0.00 ± 0.00|9.50 ± 9.91 → 99.00 ± 1.22|59.50 ± 9.55 → 63.75 ± 25.02|36.25 ± 2.17 → 95.00 ± 3.67|32.00 ± 27.79 → 98.50 ± 1.12| +|antmaze-medium-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|59.00 ± 11.18 → 97.75 ± 1.30|71.75 ± 2.95 → 89.75 ± 1.09|67.25 ± 10.47 → 97.25 ± 1.30|71.75 ± 3.27 → 98.75 ± 1.64| +|antmaze-medium-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|63.50 ± 6.84 → 97.25 ± 1.92|64.25 ± 1.92 → 92.25 ± 2.86|73.75 ± 7.29 → 94.50 ± 1.66|62.00 ± 4.30 → 98.25 ± 1.48| +|antmaze-large-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|28.75 ± 7.76 → 88.25 ± 2.28|38.50 ± 8.73 → 64.50 ± 17.04|31.50 ± 12.58 → 87.00 ± 3.24|31.75 ± 8.87 → 97.25 ± 1.79| +|antmaze-large-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|35.50 ± 3.64 → 91.75 ± 3.96|26.75 ± 3.77 → 64.25 ± 4.15|17.50 ± 7.26 → 81.00 ± 14.14|44.00 ± 8.69 → 91.50 ± 3.91| +| **average** |18.12 → 16.46|48.38 → 95.58|56.29 → 78.50|52.88 → 92.38|53.04 → 97.33| + +### Adroit +| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| +|--------------------------|------------|--------|--------|-----|-----| +|pen-cloned-v1|88.66 ± 15.10 → 86.82 ± 11.12|-2.76 ± 0.08 → -1.28 ± 2.16|84.19 ± 3.96 → 102.02 ± 20.75|6.19 ± 5.21 → 43.63 ± 20.09|-2.66 ± 0.04 → -2.68 ± 0.12| +|door-cloned-v1|0.93 ± 1.66 → 0.01 ± 0.00|-0.33 ± 0.01 → -0.33 ± 0.01|1.19 ± 0.93 → 20.34 ± 9.32|-0.21 ± 0.14 → 0.02 ± 0.31|-0.33 ± 0.01 → -0.33 ± 0.01| +|hammer-cloned-v1|1.80 ± 3.01 → 0.24 ± 0.04|0.56 ± 0.55 → 2.85 ± 4.81|1.35 ± 0.32 → 57.27 ± 28.49|3.97 ± 6.39 → 3.73 ± 4.99|0.25 ± 0.04 → 0.17 ± 0.17| +|relocate-cloned-v1|-0.04 ± 0.04 → -0.04 ± 0.01|-0.33 ± 0.01 → -0.33 ± 0.01|0.04 ± 0.04 → 0.32 ± 0.38|-0.24 ± 0.01 → -0.15 ± 0.05|-0.31 ± 0.05 → -0.31 ± 0.04| +| **average** |22.84 → 21.76|-0.72 → 0.22|21.69 → 44.99|2.43 → 11.81|-0.76 → -0.79| + +## Regrets + +### Antmaze +| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| +|--------------------------|------------|--------|--------|-----|-----| +|antmaze-umaze-v2|0.04 ± 0.01|0.02 ± 0.00|0.07 ± 0.00|0.02 ± 0.00|0.01 ± 0.00| +|antmaze-umaze-diverse-v2|0.88 ± 0.01|0.09 ± 0.01|0.43 ± 0.11|0.22 ± 0.07|0.05 ± 0.01| +|antmaze-medium-play-v2|1.00 ± 0.00|0.08 ± 0.01|0.09 ± 0.01|0.06 ± 0.00|0.04 ± 0.01| +|antmaze-medium-diverse-v2|1.00 ± 0.00|0.08 ± 0.00|0.10 ± 0.01|0.05 ± 0.01|0.04 ± 0.01| +|antmaze-large-play-v2|1.00 ± 0.00|0.21 ± 0.02|0.34 ± 0.05|0.29 ± 0.07|0.13 ± 0.02| +|antmaze-large-diverse-v2|1.00 ± 0.00|0.21 ± 0.03|0.41 ± 0.03|0.23 ± 0.08|0.13 ± 0.02| +| **average** |0.82|0.11|0.24|0.15|0.07| + +### Adroit + +| 
**Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL| +|--------------------------|------------|--------|--------|-----|-----| +|pen-cloned-v1|0.46 ± 0.02|0.97 ± 0.00|0.37 ± 0.01|0.58 ± 0.02|0.98 ± 0.01| +|door-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.83 ± 0.03|0.99 ± 0.01|1.00 ± 0.00| +|hammer-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.65 ± 0.10|0.98 ± 0.01|1.00 ± 0.00| +|relocate-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00| +| **average** |0.86|0.99|0.71|0.89|0.99| + +## Visual summary + +![](../assets/perf_profiles_online.pdf) \ No newline at end of file diff --git a/docs/benchmarks/offline.md b/docs/benchmarks/offline.md new file mode 100644 index 00000000..f83e0746 --- /dev/null +++ b/docs/benchmarks/offline.md @@ -0,0 +1,122 @@ +--- +hide: + - toc # Hide table of contents +--- + +# Offline + + Here, we report reproduced **final** and **best** scores for all datasets and offline algorithms considered. Note that they differ by a significant + margin, and some papers may use different approaches, not making it always explicit which reporting methodology they chose. + +!!! tip + If you want to re-collect our results in a more structured/nuanced manner, see [how to reproduce](repro.md) section. + +## Last Scores +### Gym-MuJoCo + +| **Task-Name**|BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT| +|------------------------------|------------|--------|--------|--------|-----|-----|------|-------|----------------|----| +|halfcheetah-medium-v2|42.40 ± 0.19|42.46 ± 0.70|48.10 ± 0.18|50.02 ± 0.27|47.04 ± 0.22|48.31 ± 0.22|64.04 ± 0.68|68.20 ± 1.28| 67.70 ± 1.04 |42.20 ± 0.26| +|halfcheetah-medium-replay-v2|35.66 ± 2.33|23.59 ± 6.95|44.84 ± 0.59|45.13 ± 0.88|45.04 ± 0.27|44.46 ± 0.22|51.18 ± 0.31|60.70 ± 1.01| 62.06 ± 1.10 |38.91 ± 0.50| +|halfcheetah-medium-expert-v2|55.95 ± 7.35|90.10 ± 2.45|90.78 ± 6.04|95.00 ± 0.61|95.63 ± 0.42|94.74 ± 0.52|103.80 ± 2.95|98.96 ± 9.31| 104.76 ± 0.64 |91.55 ± 0.95| +|hopper-medium-v2|53.51 ± 1.76|55.48 ± 7.30|60.37 ± 3.49|63.02 ± 4.56|59.08 ± 3.77|67.53 ± 3.78|102.29 ± 0.17|40.82 ± 9.91| 101.70 ± 0.28 |65.10 ± 1.61| +|hopper-medium-replay-v2|29.81 ± 2.07|70.42 ± 8.66|64.42 ± 21.52|98.88 ± 2.07|95.11 ± 5.27|97.43 ± 6.39|94.98 ± 6.53|100.33 ± 0.78| 99.66 ± 0.81 |81.77 ± 6.87| +|hopper-medium-expert-v2|52.30 ± 4.01|111.16 ± 1.03|101.17 ± 9.07|101.90 ± 6.22|99.26 ± 10.91|107.42 ± 7.80|109.45 ± 2.34|101.31 ± 11.63| 105.19 ± 10.08 |110.44 ± 0.33| +|walker2d-medium-v2|63.23 ± 16.24|67.34 ± 5.17|82.71 ± 4.78|68.52 ± 27.19|80.75 ± 3.28|80.91 ± 3.17|85.82 ± 0.77|87.47 ± 0.66| 93.36 ± 1.38 |67.63 ± 2.54| +|walker2d-medium-replay-v2|21.80 ± 10.15|54.35 ± 6.34|85.62 ± 4.01|80.62 ± 3.58|73.09 ± 13.22|82.15 ± 3.03|84.25 ± 2.25|78.99 ± 0.50| 87.10 ± 2.78 |59.86 ± 2.73| +|walker2d-medium-expert-v2|98.96 ± 15.98|108.70 ± 0.25|110.03 ± 0.36|111.44 ± 1.62|109.56 ± 0.39|111.72 ± 0.86|111.86 ± 0.43|114.93 ± 0.41| 114.75 ± 0.74 |107.11 ± 0.96| +| **locomotion average** |50.40|69.29|76.45|79.39|78.28|81.63|89.74|83.52| **92.92** |73.84| + +### Maze2d +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC| SAC-N |EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|----------------|------|----| +|maze2d-umaze-v1|0.36 ± 8.69|12.18 ± 4.29|29.41 ± 12.31|65.65 ± 5.34|-8.90 ± 6.11|42.11 ± 0.58|106.87 ± 22.16| 130.59 ± 16.52 |95.26 ± 6.39|18.08 ± 25.42| +|maze2d-medium-v1|0.79 ± 3.25|14.25 ± 2.33|59.45 ± 36.25|84.63 ± 35.54|86.11 ± 9.68|34.85 ± 2.72|105.11 ± 31.67| 88.61 ± 18.72 |57.04 ± 3.45|31.71 ± 26.33| +|maze2d-large-v1|2.26 ± 4.39|11.32 ± 
5.10|97.10 ± 25.41|215.50 ± 3.11|23.75 ± 36.70|61.72 ± 3.50|78.33 ± 61.77| 204.76 ± 1.19 |95.60 ± 22.92|35.66 ± 28.20| +| **maze2d average** |1.13|12.58|61.99|121.92|33.65|46.23|96.77| **141.32** |82.64|28.48| + +### Antmaze +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----| +|antmaze-umaze-v2|55.25 ± 4.15|65.75 ± 5.26|70.75 ± 39.18|56.75 ± 9.09|92.75 ± 1.92|77.00 ± 5.52| 97.75 ± 1.48 |0.00 ± 0.00|0.00 ± 0.00|57.00 ± 9.82| +|antmaze-umaze-diverse-v2|47.25 ± 4.09|44.00 ± 1.00|44.75 ± 11.61|54.75 ± 8.01|37.25 ± 3.70|54.25 ± 5.54| 83.50 ± 7.02 |0.00 ± 0.00|0.00 ± 0.00|51.75 ± 0.43| +|antmaze-medium-play-v2|0.00 ± 0.00|2.00 ± 0.71|0.25 ± 0.43|0.00 ± 0.00|65.75 ± 11.61|65.75 ± 11.71| 89.50 ± 3.35 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-medium-diverse-v2|0.75 ± 0.83|5.75 ± 9.39|0.25 ± 0.43|0.00 ± 0.00|67.25 ± 3.56|73.75 ± 5.45| 83.50 ± 8.20 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-large-play-v2|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|20.75 ± 7.26|42.00 ± 4.53| 52.25 ± 29.01 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +|antmaze-large-diverse-v2|0.00 ± 0.00|0.75 ± 0.83|0.00 ± 0.00|0.00 ± 0.00|20.50 ± 13.24|30.25 ± 3.63| 64.00 ± 5.43 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +| **antmaze average** | 17.21|19.71|19.33|18.58|50.71|57.17| **78.42** |0.00|0.00|18.12| + +### Adroit +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----| +|pen-human-v1|71.03 ± 6.26|26.99 ± 9.60|-3.88 ± 0.21|76.65 ± 11.71|13.71 ± 16.98|78.49 ± 8.21| 103.16 ± 8.49 |6.86 ± 5.93|5.07 ± 6.16|67.68 ± 5.48| +|pen-cloned-v1|51.92 ± 15.15|46.67 ± 14.25|5.13 ± 5.28|85.72 ± 16.92|1.04 ± 6.62|83.42 ± 8.19| 102.79 ± 7.84 |31.35 ± 2.14|12.02 ± 1.75|64.43 ± 1.43| +|pen-expert-v1|109.65 ± 7.28|114.96 ± 2.96|122.53 ± 21.27|159.91 ± 1.87|-1.41 ± 2.34|128.05 ± 9.21| 152.16 ± 6.33 |87.11 ± 48.95|-1.55 ± 0.81|116.38 ± 1.27| +|door-human-v1|2.34 ± 4.00|-0.13 ± 0.07|-0.33 ± 0.01|2.39 ± 2.26|5.53 ± 1.31|3.26 ± 1.83| -0.10 ± 0.01 |-0.38 ± 0.00|-0.12 ± 0.13|4.44 ± 0.87| +|door-cloned-v1|-0.09 ± 0.03|0.29 ± 0.59|-0.34 ± 0.01|-0.01 ± 0.01|-0.33 ± 0.01|3.07 ± 1.75| 0.06 ± 0.05 |-0.33 ± 0.00|2.66 ± 2.31|7.64 ± 3.26| +|door-expert-v1|105.35 ± 0.09|104.04 ± 1.46|-0.33 ± 0.01|104.57 ± 0.31|-0.32 ± 0.02|106.65 ± 0.25| 106.37 ± 0.29 |-0.33 ± 0.00|106.29 ± 1.73|104.87 ± 0.39| +|hammer-human-v1|3.03 ± 3.39|-0.19 ± 0.02|1.02 ± 0.24|1.01 ± 0.51|0.14 ± 0.11|1.79 ± 0.80| 0.24 ± 0.24 |0.24 ± 0.00|0.28 ± 0.18|1.28 ± 0.15| +|hammer-cloned-v1|0.55 ± 0.16|0.12 ± 0.08|0.25 ± 0.01|1.27 ± 2.11|0.30 ± 0.01|1.50 ± 0.69| 5.00 ± 3.75 |0.14 ± 0.09|0.19 ± 0.07|1.82 ± 0.55| +|hammer-expert-v1|126.78 ± 0.64|121.75 ± 7.67|3.11 ± 0.03|127.08 ± 0.13|0.26 ± 0.01|128.68 ± 0.33| 133.62 ± 0.27 |25.13 ± 43.25|28.52 ± 49.00|117.45 ± 6.65| +|relocate-human-v1|0.04 ± 0.03|-0.14 ± 0.08|-0.29 ± 0.01|0.45 ± 0.53|0.06 ± 0.03|0.12 ± 0.04| 0.16 ± 0.30 |-0.31 ± 0.01|-0.17 ± 0.17|0.05 ± 0.01| +|relocate-cloned-v1|-0.06 ± 0.01|-0.00 ± 0.02|-0.30 ± 0.01|-0.01 ± 0.03|-0.29 ± 0.01|0.04 ± 0.01| 1.66 ± 2.59 |-0.01 ± 0.10|0.17 ± 0.35|0.16 ± 0.09| +|relocate-expert-v1|107.58 ± 1.20|97.90 ± 5.21|-1.73 ± 0.96|109.52 ± 0.47|-0.30 ± 0.02|106.11 ± 4.02| 107.52 ± 2.28 |-0.36 ± 0.00|71.94 ± 18.37|104.28 ± 0.42| +| | | | | | | | | | | | +| **adroit average** | 48.18|42.69|10.40|55.71|1.53|53.43| **59.39** |12.43|18.78|49.21| + +## Best Scores 
+### Gym-MuJoCo +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT| +|--------------------|------------|--------|--------|--------|-----|-----|------|-------|---------------|----| +|halfcheetah-medium-v2|43.60 ± 0.14|43.90 ± 0.13|48.93 ± 0.11|50.81 ± 0.15|47.62 ± 0.03|48.84 ± 0.07|65.62 ± 0.46|72.21 ± 0.31| 69.72 ± 0.92 |42.73 ± 0.10| +|halfcheetah-medium-replay-v2|40.52 ± 0.19|42.27 ± 0.46|45.84 ± 0.26|46.47 ± 0.26|46.43 ± 0.19|45.35 ± 0.08|52.22 ± 0.31|67.29 ± 0.34| 66.55 ± 1.05 |40.31 ± 0.28| +|halfcheetah-medium-expert-v2|79.69 ± 3.10|94.11 ± 0.22|96.59 ± 0.87|96.83 ± 0.23|97.04 ± 0.17|95.38 ± 0.17|108.89 ± 1.20|111.73 ± 0.47| 110.62 ± 1.04 |93.40 ± 0.21| +|hopper-medium-v2|69.04 ± 2.90|73.84 ± 0.37|70.44 ± 1.18|95.42 ± 3.67|70.80 ± 1.98|80.46 ± 3.09|103.19 ± 0.16|101.79 ± 0.20| 103.26 ± 0.14 |69.42 ± 3.64| +|hopper-medium-replay-v2|68.88 ± 10.33|90.57 ± 2.07|98.12 ± 1.16|101.47 ± 0.23|101.63 ± 0.55|102.69 ± 0.96|102.57 ± 0.45|103.83 ± 0.53| 103.28 ± 0.49 |88.74 ± 3.02| +|hopper-medium-expert-v2|90.63 ± 10.98|113.13 ± 0.16|113.22 ± 0.43|113.26 ± 0.49|112.84 ± 0.66|113.18 ± 0.38|113.16 ± 0.43|111.24 ± 0.15| 111.80 ± 0.11 |111.18 ± 0.21| +|walker2d-medium-v2|80.64 ± 0.91|82.05 ± 0.93|86.91 ± 0.28|85.86 ± 3.76|84.77 ± 0.20|87.58 ± 0.48|87.79 ± 0.19|90.17 ± 0.54| 95.78 ± 1.07 |74.70 ± 0.56| +|walker2d-medium-replay-v2|48.41 ± 7.61|76.09 ± 0.40|91.17 ± 0.72|86.70 ± 0.94|89.39 ± 0.88|89.94 ± 0.93|91.11 ± 0.63|85.18 ± 1.63| 89.69 ± 1.39 |68.22 ± 1.20| +|walker2d-medium-expert-v2|109.95 ± 0.62|109.90 ± 0.09|112.21 ± 0.06|113.40 ± 2.22|111.63 ± 0.38|113.06 ± 0.53|112.49 ± 0.18|116.93 ± 0.42| 116.52 ± 0.75 |108.71 ± 0.34| +| **locomotion average** | 70.15|80.65|84.83|87.80|84.68|86.28|93.00|95.60| **96.36** |77.49| + + +### Maze2d +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----| +|maze2d-umaze-v1|16.09 ± 0.87|22.49 ± 1.52|99.33 ± 16.16|136.96 ± 10.89|92.05 ± 13.66|50.92 ± 4.23| 162.28 ± 1.79 |153.12 ± 6.49|149.88 ± 1.97|63.83 ± 17.35| +|maze2d-medium-v1|19.16 ± 1.24|27.64 ± 1.87|150.93 ± 3.89|152.73 ± 20.78|128.66 ± 5.44|122.69 ± 30.00| 150.12 ± 4.48 |93.80 ± 14.66|154.41 ± 1.58|68.14 ± 12.25| +|maze2d-large-v1|20.75 ± 6.66|41.83 ± 3.64|197.64 ± 5.26|227.31 ± 1.47|157.51 ± 7.32|162.25 ± 44.18| 197.55 ± 5.82 |207.51 ± 0.96|182.52 ± 2.68|50.25 ± 19.34| +| | | | | | | | | | | | +| **maze2d average** | 18.67|30.65|149.30|172.33|126.07|111.95| **169.98** |151.48|162.27|60.74| + +### Antmaze +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----| +|antmaze-umaze-v2|68.50 ± 2.29|77.50 ± 1.50|98.50 ± 0.87|70.75 ± 8.84|94.75 ± 0.83|84.00 ± 4.06| 100.00 ± 0.00 |0.00 ± 0.00|42.50 ± 28.61|64.50 ± 2.06| +|antmaze-umaze-diverse-v2|64.75 ± 4.32|63.50 ± 2.18|71.25 ± 5.76|81.50 ± 4.27|53.75 ± 2.05|79.50 ± 3.35| 96.75 ± 2.28 |0.00 ± 0.00|0.00 ± 0.00|60.50 ± 2.29| +|antmaze-medium-play-v2|4.50 ± 1.12|6.25 ± 2.38|3.75 ± 1.30|25.00 ± 10.70|80.50 ± 3.35|78.50 ± 3.84| 93.50 ± 2.60 |0.00 ± 0.00|0.00 ± 0.00|0.75 ± 0.43| +|antmaze-medium-diverse-v2|4.75 ± 1.09|16.50 ± 5.59|5.50 ± 1.50|10.75 ± 5.31|71.00 ± 4.53|83.50 ± 1.80| 91.75 ± 2.05 |0.00 ± 0.00|0.00 ± 0.00|0.50 ± 0.50| +|antmaze-large-play-v2|0.50 ± 0.50|13.50 ± 9.76|1.25 ± 0.43|0.50 ± 0.50|34.75 ± 5.85|53.50 ± 2.50| 68.75 ± 13.90 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| 
+|antmaze-large-diverse-v2|0.75 ± 0.43|6.25 ± 1.79|0.25 ± 0.43|0.00 ± 0.00|36.25 ± 3.34|53.00 ± 3.00| 69.50 ± 7.26 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00| +| **antmaze average** |23.96|30.58|30.08|31.42|61.83|72.00| **86.71** |0.00|7.08|21.04| + +### Adroit +| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT| +|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----| +|pen-human-v1|99.69 ± 7.45|59.89 ± 8.03|9.95 ± 8.19|119.03 ± 6.55|58.91 ± 1.81|106.15 ± 10.28| 127.28 ± 3.22 |56.48 ± 7.17|35.84 ± 10.57|77.83 ± 2.30| +|pen-cloned-v1|99.14 ± 12.27|83.62 ± 11.75|52.66 ± 6.33|125.78 ± 3.28|14.74 ± 2.31|114.05 ± 4.78| 128.64 ± 7.15 |52.69 ± 5.30|26.90 ± 7.85|71.17 ± 2.70| +|pen-expert-v1|128.77 ± 5.88|134.36 ± 3.16|142.83 ± 7.72|162.53 ± 0.30|14.86 ± 4.07|140.01 ± 6.36| 157.62 ± 0.26 |116.43 ± 40.26|36.04 ± 4.60|119.49 ± 2.31| +|door-human-v1|9.41 ± 4.55|7.00 ± 6.77|-0.11 ± 0.06|17.70 ± 2.55|13.28 ± 2.77|13.52 ± 1.22| 0.27 ± 0.43 |-0.10 ± 0.06|2.51 ± 2.26|7.36 ± 1.24| +|door-cloned-v1|3.40 ± 0.95|10.37 ± 4.09|-0.20 ± 0.11|10.53 ± 2.82|-0.08 ± 0.13|9.02 ± 1.47| 7.73 ± 6.80 |-0.21 ± 0.10|20.36 ± 1.11|11.18 ± 0.96| +|door-expert-v1|105.84 ± 0.23|105.92 ± 0.24|4.49 ± 7.39|106.60 ± 0.27|59.47 ± 25.04|107.29 ± 0.37| 106.78 ± 0.04 |0.05 ± 0.02|109.22 ± 0.24|105.49 ± 0.09| +|hammer-human-v1|12.61 ± 4.87|6.23 ± 4.79|2.38 ± 0.14|16.95 ± 3.61|0.30 ± 0.05|6.86 ± 2.38| 1.18 ± 0.15 |0.25 ± 0.00|3.49 ± 2.17|1.68 ± 0.11| +|hammer-cloned-v1|8.90 ± 4.04|8.72 ± 3.28|0.96 ± 0.30|10.74 ± 5.54|0.32 ± 0.03|11.63 ± 1.70| 48.16 ± 6.20 |12.67 ± 15.02|0.27 ± 0.01|2.74 ± 0.22| +|hammer-expert-v1|127.89 ± 0.57|128.15 ± 0.66|33.31 ± 47.65|129.08 ± 0.26|0.93 ± 1.12|129.76 ± 0.37| 134.74 ± 0.30 |91.74 ± 47.77|69.44 ± 47.00|127.39 ± 0.10| +|relocate-human-v1|0.59 ± 0.27|0.16 ± 0.14|-0.29 ± 0.01|1.77 ± 0.84|1.03 ± 0.20|1.22 ± 0.28| 3.70 ± 2.34 |-0.18 ± 0.14|0.05 ± 0.02|0.08 ± 0.02| +|relocate-cloned-v1|0.45 ± 0.31|0.74 ± 0.45|-0.02 ± 0.04|0.39 ± 0.13|-0.07 ± 0.02|1.78 ± 0.70| 9.25 ± 2.56 |0.10 ± 0.04|4.11 ± 1.39|0.34 ± 0.09| +|relocate-expert-v1|110.31 ± 0.36|109.77 ± 0.60|0.23 ± 0.27|111.21 ± 0.32|0.03 ± 0.10|110.12 ± 0.82| 111.14 ± 0.23 |-0.07 ± 0.08|98.32 ± 3.75|106.49 ± 0.30| +| **adroit average** | 58.92|54.58|20.51|67.69|13.65|62.62| **69.71** |27.49|33.88|52.60| + +## Visual summary + +![](../assets/perf_profiles_offline.pdf) \ No newline at end of file diff --git a/docs/benchmarks/repro.md b/docs/benchmarks/repro.md new file mode 100644 index 00000000..b90758fa --- /dev/null +++ b/docs/benchmarks/repro.md @@ -0,0 +1,30 @@ +# How to Reproduce + +To reproduce all figures and tables from our [technical paper](https://arxiv.org/abs/2210.07105), do the following steps. + +## Collect wandb logs + +These scripts collect all wandb logs into .csv files and save them into the `runs_tables` folder. +We provide the tables, but you can recollect them. +```python +python results/get_offline_urls.py +python results/get_finetune_urls.py +``` + +## Collect scores + +These scripts collect data from runs kept in .csv files and save evaluation scores (and regret in case of offline-to-online) +into pickled files, which are stored in the `bin` folder. We provide the pickled data, but if you need to extract more data, +you can modify scripts for your purposes. +```python +python results/get_offline_scores.py +python results/get_finetune_scores.py +``` + +## Print tables + + These scripts use pickled data, print all the tables, and save all figures into the `out` directory. 
+```python
+python results/get_offline_tables_and_plots.py
+python results/get_finetune_tables_and_plots.py
+```
\ No newline at end of file
diff --git a/docs/community/contrib.md b/docs/community/contrib.md
new file mode 100644
index 00000000..1f91155b
--- /dev/null
+++ b/docs/community/contrib.md
@@ -0,0 +1,116 @@
+# Contribution
+
+## Contributing to the codebase
+
+We welcome:
+
+- Bug reports
+- Pull requests for bug fixes
+- Logs and documentation improvements
+- New algorithms and datasets
+- Better hyperparameters (with evidence to back them up)
+
+### Setup
+
+Contributing code is done through the standard GitHub workflow:
+
+1. Fork this repo
+2. Make a change and commit your code
+3. Submit a pull request. It will be reviewed by maintainers, and they'll give feedback or make requests as applicable
+
+```commandline
+git clone git@github.com:tinkoff-ai/CORL.git
+cd CORL
+pip install -r requirements/requirements_dev.txt
+```
+
+For dependency installation, see the [get started section](../get-started/install.md).
+
+### Code style
+
+The CI will run several checks on the new code pushed to the CORL repository.
+These checks can also be run locally without waiting for the CI by following the steps below:
+
+1. [install `pre-commit`](https://pre-commit.com/#install),
+2. install the Git hooks by running `pre-commit install`.
+
+Once those two steps are done, the Git hooks will be run automatically at every new commit.
+The Git hooks can also be run manually with `pre-commit run --all-files`, and
+if needed, they can be skipped (not recommended) with `git commit --no-verify`.
+
+We use [Ruff](https://github.com/astral-sh/ruff) as our main linter. If you want to see possible
+problems before committing, run `ruff check --diff .` to see the exact linter suggestions and fixes.
+
+## Adding new algorithms
+
+!!! warning
+    While we welcome any algorithm, it is better to open an issue with a proposal first
+    so we can discuss the details. Unfortunately, not all algorithms are equally
+    easy to understand and reproduce. We may be able to offer some advice, or, on the contrary,
+    warn you that a particular algorithm would require too many computational resources
+    to fully reproduce the results, and that it is better to work on something else.
+
+New offline algorithms should go to `algorithms/contrib/offline`,
+and offline-to-online algorithms to `algorithms/contrib/finetune`.
+
+We as a team try to keep the core as reliable and reproducible as possible,
+but we may not have the resources to support all future algorithms.
+Therefore, this separation is necessary, as we cannot guarantee that all
+algorithms from `algorithms/contrib` exactly reproduce the results of their original publications.
+
+Make sure your new code is properly documented and that all references to the original implementations and papers are present
+(for example, as in [Decision Transformer](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py)).
+Follow the existing conventions for naming config arguments, functions, and classes, and try to stylistically match the existing implementations.
+
+Please **explain all the tricks and possible differences from the original implementation in as much detail as possible**.
+Keep in mind that this code may be used by other researchers. Make their lives easier!
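+
+To make these conventions more concrete, here is a minimal, illustrative sketch of a contrib config dataclass. It is only a sketch: the algorithm name `MyAlgo`, the exact field set, and the defaults are hypothetical, so copy the real structure (commented fields, a pyrallis-style dataclass, a `__post_init__` that builds the run name) from the existing implementations rather than from this snippet.
+```python
+# Illustrative sketch only: names and defaults below are assumptions, not a prescribed API.
+import uuid
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class TrainConfig:
+    # wandb project to log to
+    project: str = "CORL"
+    # wandb group, versioned as described in "Running benchmarks" below
+    group: str = "MyAlgo-D4RL"
+    # wandb run name
+    name: str = "MyAlgo"
+    # training dataset and evaluation environment
+    env_name: str = "halfcheetah-medium-v2"
+    # training random seed
+    train_seed: int = 0
+    # optional path for saving checkpoints
+    checkpoints_path: Optional[str] = None
+
+    def __post_init__(self):
+        # make the run name unique, mirroring the __post_init__ convention of existing configs
+        self.name = f"{self.name}-{self.env_name}-{self.train_seed}-{str(uuid.uuid4())[:8]}"
+```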
+
+### Running benchmarks
+
+Although you will have to do a hyperparameter search while reproducing the algorithm,
+in the end we expect to see final configs in `configs/contrib/<algo_type>/<algo_name>/<dataset_name>.yaml` with the best hyperparameters for all
+datasets considered. The configs should be in `yaml` format, containing all hyperparameters sorted
+in alphabetical order (see existing configs for inspiration).
+
+Use these conventions to name your runs in the configs:
+1. `name: <algo_name>`
+2. `group: <algo_name>-<dataset_name>-multiseed-v0`, increment the version if needed
+3. use our [\_\_post_init\_\_](https://github.com/tinkoff-ai/CORL/blob/962688b405f579a1ce6ec1b57e6369aaf76f9e69/algorithms/offline/awac.py#L48) implementation in your config dataclass
+
+Since we are releasing wandb logs for all algorithms, you will need to submit multiseed (~4 seeds)
+training runs to the `CORL` project in the wandb [corl-team](https://wandb.ai/corl-team) organization. We'll invite you there when the time comes.
+
+We usually use wandb sweeps for this. You can use this example config (it will work with pyrallis, as it expects a `config_path` cli argument):
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/contrib/<algo_name>.py
+method: grid
+parameters:
+  config_path:
+    # algo_type is offline or finetune (see sections above)
+    values: [
+      "configs/contrib/<algo_type>/<algo_name>/<dataset_1>.yaml",
+      "configs/contrib/<algo_type>/<algo_name>/<dataset_2>.yaml",
+      "configs/contrib/<algo_type>/<algo_name>/<dataset_3>.yaml",
+    ]
+  train_seed:
+    values: [0, 1, 2, 3]
+```
+Then proceed as usual. Create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent <sweep_id>`.
+
+Based on the results, you will need to create wandb reports so that the results are easier for other users to understand.
+You can use any of the already existing ones as an example (see [README.md](https://github.com/corl-team/CORL/tree/main)).
+
+### Checklist
+
+Ideally, all checks should be completed!
+
+- [ ] An issue about the new algorithm is open
+- [ ] The single-file implementation is added to `algorithms/contrib`
+- [ ] The PR has passed all the tests
+- [ ] Evidence that the implementation reproduces the original results is provided
+- [ ] Configs with the best hyperparameters for all datasets are added to `configs/contrib`
+- [ ] Logs and reports for the best hyperparameters are submitted to our wandb organization
diff --git a/docs/community/publications.md b/docs/community/publications.md
new file mode 100644
index 00000000..cabc4960
--- /dev/null
+++ b/docs/community/publications.md
@@ -0,0 +1,28 @@
+# List of Publications
+
+!!! tip
+    Please open a pull request to add missing entries!
+
+A list of publications that use CORL algorithms or benchmarked results:
+
+- Lu, C., Ball, P. J., & Parker-Holder, J. Synthetic Experience Replay.
+- Beeson, A., & Montana, G. (2023). Balancing policy constraint and ensemble size in uncertainty-based offline reinforcement learning. arXiv preprint arXiv:2303.14716.
+- Nikulin, A., Kurenkov, V., Tarasov, D., & Kolesnikov, S. (2023). Anti-exploration by random network distillation. arXiv preprint arXiv:2301.13616.
+- Bhargava, P., Chitnis, R., Geramifard, A., Sodhani, S., & Zhang, A. (2023). Sequence Modeling is a Robust Contender for Offline Reinforcement Learning. arXiv preprint arXiv:2305.14550.
+- Hu, X., Ma, Y., Xiao, C., Zheng, Y., & Meng, Z. (2023). In-Sample Policy Iteration for Offline Reinforcement Learning. arXiv preprint arXiv:2306.05726.
+- Lian, S., Ma, Y., Liu, J., Zheng, Y., & Meng, Z. (2023). HIPODE: Enhancing Offline Reinforcement Learning with High-Quality Synthetic Data from a Policy-Decoupled Approach.
arXiv preprint arXiv:2306.06329.
+- He, H., Bai, C., Xu, K., Yang, Z., Zhang, W., Wang, D., ... & Li, X. (2023). Diffusion Model is an Effective Planner and Data Synthesizer for Multi-Task Reinforcement Learning. arXiv preprint arXiv:2305.18459.
+- Liu, J., Ma, Y., Hao, J., Hu, Y., Zheng, Y., Lv, T., & Fan, C. (2023). Prioritized Trajectory Replay: A Replay Memory for Data-driven Reinforcement Learning. arXiv preprint arXiv:2306.15503.
+- Chitnis, R., Xu, Y., Hashemi, B., Lehnert, L., Dogan, U., Zhu, Z., & Delalleau, O. (2023). IQL-TD-MPC: Implicit Q-Learning for Hierarchical Model Predictive Control. arXiv preprint arXiv:2306.00867.
+- Kurenkov, V., Nikulin, A., Tarasov, D., & Kolesnikov, S. (2023). Katakomba: Tools and Benchmarks for Data-Driven NetHack. arXiv preprint arXiv:2306.08772.
+- Lian, S., Ma, Y., Liu, J., Jianye, H. A. O., Zheng, Y., & Meng, Z. (2023, July). A Policy-Decoupled Method for High-Quality Data Augmentation in Offline Reinforcement Learning. In ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems.
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
new file mode 100644
index 00000000..3b788d3d
--- /dev/null
+++ b/docs/get-started/install.md
@@ -0,0 +1,38 @@
+# Installation
+
+## Manual
+!!! warning
+    Unfortunately, installing all the dependencies can cause some difficulties at the moment, mainly due to **D4RL** and
+    the old version of MuJoCo it is locked to. This will become much easier once the migration to **Minari** is done.
+
+All necessary dependencies are specified in the [`requirements/requirements.txt`](https://github.com/corl-team/CORL/blob/main/requirements/requirements.txt) file.
+You can just clone the repo and install all dependencies with pip:
+```commandline
+git clone https://github.com/corl-team/CORL.git
+cd CORL
+pip install -r requirements/requirements.txt
+```
+
+In addition to those specified there, the dependencies required by D4RL, namely the MuJoCo binaries, must also be installed.
+We recommend following the official guide from [**mujoco-py**](https://github.com/openai/mujoco-py). You will need to download
+the MuJoCo 2.1 binaries and extract the downloaded `mujoco210` directory to `~/.mujoco/mujoco210`:
+```commandline
+mkdir -p ~/.mujoco \
+    && wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz \
+    && tar -xf mujoco.tar.gz -C ~/.mujoco \
+    && rm mujoco.tar.gz
+export LD_LIBRARY_PATH=~/.mujoco/mujoco210/bin:${LD_LIBRARY_PATH}
+```
+If you have any problems with the installation, we advise you to first look for similar issues in the
+original [**D4RL**](https://github.com/Farama-Foundation/D4RL) and [**mujoco-py**](https://github.com/openai/mujoco-py) repositories.
+Most likely, the problem is in **D4RL**, not in **CORL** :smile:
+
+## Docker
+
+To simplify installation and improve reproducibility, we provide a preconfigured
+[Dockerfile](https://github.com/corl-team/CORL/blob/main/Dockerfile) that you can use:
+```bash
+cd CORL
+docker build -t corl .
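+# Optional, and an assumption beyond the original instructions: to enable wandb logging
+# from inside the container, pass your API key to `docker run` via docker's standard `-e` flag, e.g.
+# docker run --gpus=all -it --rm -e WANDB_API_KEY=$WANDB_API_KEY --name corl-container corl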
+docker run --gpus=all -it --rm --name corl-container corl +``` \ No newline at end of file diff --git a/docs/get-started/usage.md b/docs/get-started/usage.md new file mode 100644 index 00000000..e65b71c0 --- /dev/null +++ b/docs/get-started/usage.md @@ -0,0 +1,157 @@ +# Basic Usage + +![corl_tldr](../assets/corl.pdf) + +## How to Train + +We use [pyrallis](https://github.com/eladrich/pyrallis) for the configuration, thus after the dependencies have been installed, +there are two ways to run the CORL algorithms: + +1. Manually specifying all the arguments within the terminal (they will overwrite the default ones): +```commandline +python algorithms/offline/dt.py \ + --project="CORL-Test" \ + --group="DT-Test" \ + --name="dt-testing-run" \ + --env_name="halfcheetah-medium-v2" \ + --device="cuda:0" + # etc... +``` + +2. With yaml config. First, create yaml file with all needed hyperparameters: +```yaml title="dt_example_config.yaml" +# taken from https://github.com/corl-team/CORL/blob/main/configs/offline/dt/halfcheetah/medium_v2.yaml +attention_dropout: 0.1 +batch_size: 4096 +betas: +- 0.9 +- 0.999 +checkpoints_path: null +clip_grad: 0.25 +deterministic_torch: false +device: cuda +embedding_dim: 128 +embedding_dropout: 0.1 +env_name: "halfcheetah-medium-v2" +episode_len: 1000 +eval_episodes: 100 +eval_every: 5000 +eval_seed: 42 +group: "dt-halfcheetah-medium-v2-multiseed-v2" +learning_rate: 0.0008 +max_action: 1.0 +name: "DT" +num_heads: 1 +num_layers: 3 +num_workers: 4 +project: "CORL" +residual_dropout: 0.1 +reward_scale: 0.001 +seq_len: 20 +target_returns: [12000.0, 6000.0] +train_seed: 10 +update_steps: 100000 +warmup_steps: 10000 +weight_decay: 0.0001 +``` +After that we can supply all hyperparameters from config with `config_path` argument: +```commandline +python algorithms/offline/dt.py \ + --config_path="dt_example_config.yaml" + # you can also overwrite any hyperparameter if needed + --device="cuda:0" + # etc... +``` +By default, training script will log metrics to the wandb project specified by the `group` argument. +If you want to disable logging, run `wandb disabled` or `wandb offline`. To turn it back on, run `wandb online`. +For more options see [wandb documentation](https://docs.wandb.ai/guides/technical-faq/general#can-i-disable-wandb-when-testing-my-code). + + If you're not familiar with [Weights & Biases](https://wandb.ai/site) logging tools, it is better to first familiarize + yourself with the basics [here](https://docs.wandb.ai/quickstart). + + For an explanation of all logged metrics, refer to the documentation of the specific algorithm. + +## CLI Documentation + +How to find out all available hyperparameters and their brief explanation? 
Very simple, just run `python algorithms/offline/dt.py --help` (this will work for all algorithms): +```commandline +usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int] + [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float] + [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float] + [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int] + [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]] + [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool] + [--train_seed int] [--eval_seed int] [--device str] + +optional arguments: + -h, --help show this help message and exit + --config_path str Path for a config file to parse with pyrallis (default: None) + +TrainConfig: + + --project str wandb project name (default: CORL) + --group str wandb group name (default: DT-D4RL) + --name str wandb run name (default: DT) + --embedding_dim int transformer hidden dim (default: 128) + --num_layers int depth of the transformer model (default: 3) + --num_heads int number of heads in the attention (default: 1) + --seq_len int maximum sequence length during training (default: 20) + --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000) + --attention_dropout float + attention dropout (default: 0.1) + --residual_dropout float + residual dropout (default: 0.1) + --embedding_dropout float + embeddings dropout (default: 0.1) + --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0) + --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2) + --learning_rate float + AdamW optimizer learning rate (default: 0.0001) + --betas float float AdamW optimizer betas (default: (0.9, 0.999)) + --weight_decay float AdamW weight decay (default: 0.0001) + --clip_grad [float] maximum gradient norm during training, optional (default: 0.25) + --batch_size int training batch size (default: 64) + --update_steps int total training steps (default: 100000) + --warmup_steps int warmup steps for the learning rate scheduler (increasing from zero to learning_rate) (default: + 10000) + --reward_scale float reward scaling, to reduce the magnitude (default: 0.001) + --num_workers int number of workers for the pytorch dataloader (default: 4) + --target_returns float [float, ...] + target return-to-go for the prompting durint evaluation (default: (12000.0, 6000.0)) + --eval_episodes int number of episodes to run during evaluation (default: 100) + --eval_every int evaluation frequency, will evaluate eval_every training steps (default: 10000) + --checkpoints_path [str] + path for checkpoints saving, optional (default: None) + --deterministic_torch bool + configure PyTorch to use deterministic algorithms instead of nondeterministic ones where available + (default: False) + --train_seed int training random seed (default: 10) + --eval_seed int evaluation random seed (default: 42) + --device str training device (default: cuda) +``` + +## Benchmarking + +Sooner or later you will probably want to run many experiments at once, for example to search for hyperparameters, +or to do multi-seed training for some datasets. For something like this we recommend using wandb sweeps (and we use them ourselves). +The general recipe looks like this. 
First, create a wandb sweep config:
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/offline/dt.py
+method: grid
+parameters:
+  # specify all configs to run for the chosen algorithm
+  config_path:
+    values: [
+      "configs/offline/dt/halfcheetah/medium_v2.yaml",
+      "configs/offline/dt/halfcheetah/medium_replay_v2.yaml",
+      "configs/offline/dt/halfcheetah/medium_expert_v2.yaml",
+    ]
+  train_seed:
+    values: [0, 1, 2, 3]
+```
+Then proceed as usual. Create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent <sweep_id>`.
+This will train multiple seeds for each config.
+
+All configs with full hyperparameters for all datasets and algorithms are in [`configs`](https://github.com/corl-team/CORL/tree/main/configs).
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..cb337970
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,77 @@
+---
+hide:
+  - toc  # Hide table of contents
+---
+
+# CORL (Clean Offline Reinforcement Learning)
+
+[![Twitter](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://twitter.com/vladkurenkov/status/1669361090550177793)
+[![arXiv](https://img.shields.io/badge/arXiv-2210.07105-b31b1b.svg)](https://arxiv.org/abs/2210.07105)
+[](https://github.com/tinkoff-ai/CORL/blob/main/LICENSE)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+
+🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations
+of SOTA **offline reinforcement learning** algorithms. Each implementation is backed by a research-friendly codebase, allowing
+you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL,
+check them out too! The highlight features of CORL are:
+
+* 📜 Single-file implementation
+* 📈 Benchmarked Implementation (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs :material-arm-flex:)
+* 🖼 [Weights and Biases](https://wandb.ai/site) integration
+
+You can read more about CORL design and main results in our [technical paper](https://arxiv.org/abs/2210.07105).
+
+!!! tip
+    ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment.
+
+!!! info
+    **Minari** and **Gymnasium** support: [Farama-Foundation/Minari](https://github.com/Farama-Foundation/Minari) is the
+    next generation of D4RL that will continue to be maintained and introduce new features and datasets.
+    Please see their [announcement](https://farama.org/Announcing-Minari) for further details.
+    We are gradually migrating to Minari, and the progress
+    can be tracked [here](https://github.com/corl-team/CORL/issues/2). This will allow us to significantly update dependencies,
+    simplify installation, and give users access to many new datasets out of the box!
+
+!!! warning
+    CORL (similarly to CleanRL) is not a modular library and therefore it is not meant to be imported.
+    At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy
+    to understand. You should consider using CORL if you want to 1) understand and control all implementation details
+    of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support.
+
+
+## Algorithms Implemented
+
+| Algorithm | Variants Implemented | Wandb Report |
+|--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- |
+| **Offline and Offline-to-Online** | |
+| ✅ [Conservative Q-Learning for Offline Reinforcement Learning
(CQL)](https://arxiv.org/abs/2006.04779) | :material-github: [`offline/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/cql.py)
:material-github: [`finetune/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cql.py)
:material-file-document: [docs](algorithms/cql.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-CQL--VmlldzoyNzA2MTk5)
:material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-CQL--Vmlldzo0NTQ3NTMz) +| ✅ [Accelerating Online Reinforcement Learning with Offline Datasets
(AWAC)](https://arxiv.org/abs/2006.09359) | :material-github: [`offline/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/awac.py)
:material-github: [`finetune/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/awac.py)
:material-file-document: [docs](algorithms/awac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-AWAC--VmlldzoyNzA2MjE3)
:material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-AWAC--VmlldzozODAyNzQz) +| ✅ [Offline Reinforcement Learning with Implicit Q-Learning
(IQL)](https://arxiv.org/abs/2110.06169) | :material-github: [`offline/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/iql.py)
:material-github: [`finetune/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/iql.py)
:material-file-document: [docs](algorithms/iql.md) |:material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-IQL--VmlldzoyNzA2MTkx)
:material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-IQL--VmlldzozNzE1MTEy) +| **Offline-to-Online only** | | +| ✅ [Supported Policy Optimization for Offline Reinforcement Learning
(SPOT)](https://arxiv.org/abs/2202.06239) | :material-github: [`finetune/spot.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/spot.py)
:material-file-document: [docs](algorithms/spot.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-SPOT--VmlldzozODk5MTgx) +| ✅ [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning
(Cal-QL)](https://arxiv.org/abs/2303.05479) | :material-github: [`finetune/cal_ql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cal_ql.py)
:material-file-document: [docs](algorithms/cal-ql.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-Cal-QL--Vmlldzo0NTQ3NDk5) +| **Offline only** | | +| ✅ Behavioral Cloning
(BC) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py)
:material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC--VmlldzoyNzA2MjE1) +| ✅ Behavioral Cloning-10%
(BC-10%) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py)
:material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC-10---VmlldzoyNzEwMjcx) +| ✅ [A Minimalist Approach to Offline Reinforcement Learning
(TD3+BC)](https://arxiv.org/abs/2106.06860) | :material-github: [`offline/td3_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/td3_bc.py)
:material-file-document: [docs](algorithms/td3-bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-TD3-BC--VmlldzoyNzA2MjA0) +| ✅ [Decision Transformer: Reinforcement Learning via Sequence Modeling
(DT)](https://arxiv.org/abs/2106.01345) | :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py)
:material-file-document: [docs](algorithms/dt.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-Decision-Transformer--VmlldzoyNzA2MTk3) +| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(SAC-N)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py)
:material-file-document: [docs](algorithms/sac-n.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-SAC-N--VmlldzoyNzA1NTY1) +| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble
(EDAC)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/edac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/edac.py)
:material-file-document: [docs](algorithms/edac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-EDAC--VmlldzoyNzA5ODUw) +| ✅ [Revisiting the Minimalist Approach to Offline Reinforcement Learning
(ReBRAC)](https://arxiv.org/abs/2305.09836) | :material-github: [`offline/rebrac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/rebrac.py)
:material-file-document: [docs](algorithms/rebrac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-ReBRAC--Vmlldzo0ODkzOTQ2) +| ✅ [Q-Ensemble for Offline RL: Don't Scale the Ensemble, Scale the Batch Size
(LB-SAC)](https://arxiv.org/abs/2211.11092) | :material-github: [`offline/lb_sac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/lb_sac.py)
:material-file-document: [docs](algorithms/lb-sac.md) | :material-chart-box: [`Offline Gym-MuJoCo`](https://wandb.ai/tlab/CORL/reports/LB-SAC-D4RL-Results--VmlldzozNjIxMDY1) + +## Citing CORL +If you use CORL in your work, please use the following bibtex +```bibtex +@inproceedings{ +tarasov2022corl, + title={CORL: Research-oriented Deep Offline Reinforcement Learning Library}, + author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov}, + booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''}, + year={2022}, + url={https://openreview.net/forum?id=SyAS49bBcv} +} +``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..bda5b719 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,85 @@ +site_name: Clean Offline RL +theme: + name: material + logo: assets/logo.jpeg + palette: + # Palette toggle for light mode + - scheme: default + toggle: + icon: material/toggle-switch + name: Switch to dark mode + + # Palette toggle for dark mode + - scheme: slate + toggle: + icon: material/toggle-switch-off-outline + name: Switch to light mode + features: + - navigation.instant + - navigation.tracking + - navigation.sections + - navigation.expand + - navigation.path +# - toc.integrate + - navigation.top + - search.suggest + - search.highlight + - header.autohide + - content.code.copy + - content.code.annotate + +copyright: Copyright © 2022, CORL Team + +repo_url: https://github.com/corl-team/CORL +repo_name: corl-team/CORL + +markdown_extensions: + - meta + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.tasklist: + custom_checkbox: true + clickable_checkbox: false + - pymdownx.arithmatex: + generic: true + +extra_javascript: + - javascripts/mathjax.js + - https://polyfill.io/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + +nav: + - Overview: index.md + - Get Started: + - get-started/install.md + - get-started/usage.md + - Benchmarks: + - benchmarks/offline.md + - benchmarks/offline-to-online.md + - benchmarks/repro.md + - Algorithms: + - algorithms/bc.md + - algorithms/td3-bc.md + - algorithms/dt.md + - algorithms/sac-n.md + - algorithms/edac.md + - algorithms/rebrac.md + - algorithms/lb-sac.md + - algorithms/cql.md + - algorithms/awac.md + - algorithms/iql.md + - algorithms/cal-ql.md + - algorithms/spot.md + - Community: + - community/contrib.md + - community/publications.md \ No newline at end of file diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt index e0e489b0..16d0cc05 100644 --- a/requirements/requirements_dev.txt +++ b/requirements/requirements_dev.txt @@ -10,6 +10,7 @@ torch==1.11.0+cu113 pyrallis==0.3.1 pre-commit==3.3.3 ruff==0.0.278 +mkdocs-material==9.1.21 --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html jax==0.4.1 jaxlib[cuda11_cudnn82]==0.4.1 diff --git a/results/get_offline_scores.py b/results/get_offline_scores.py index 0e165a97..fe5e8570 100644 --- a/results/get_offline_scores.py +++ b/results/get_offline_scores.py @@ -44,7 +44,11 @@ def process_runs(df): df.iterrows(), desc="Runs scores downloading", position=0, leave=True ): full_scores[row["algorithm"]][row["dataset"]].append( - get_run_scores(row["url"], 
row["algorithm"] == "DT", row["algorithm"] == "AWAC") + get_run_scores( + row["url"], + row["algorithm"] == "DT", + row["algorithm"] == "AWAC" + ) ) return full_scores