diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..0481485d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,25 @@
+name: ci
+on:
+ push:
+ branches:
+ - main
+ - howuhh/docs-wip
+permissions:
+ contents: write
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: 3.x
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+ - uses: actions/cache@v3
+ with:
+ key: mkdocs-material-${{ env.cache_id }}
+ path: .cache
+ restore-keys: |
+ mkdocs-material-
+ - run: pip install mkdocs-material
+ - run: mkdocs gh-deploy --force
diff --git a/.gitignore b/.gitignore
index ce35dd32..03469487 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,4 +145,4 @@ dmypy.json
.json
.yaml
wandb
-assets/
\ No newline at end of file
+#assets/
\ No newline at end of file
diff --git a/README.md b/README.md
index 2792adfc..36f74e2b 100644
--- a/README.md
+++ b/README.md
@@ -10,16 +10,25 @@
🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations of SOTA ORL algorithms. Each implementation is backed by a research-friendly codebase, allowing you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL, check them out too!
* 📜 Single-file implementation
-* 📈 Benchmarked Implementation for N algorithms
+* 📈 Benchmarked implementations (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs)
* 🖼 [Weights and Biases](https://wandb.ai/site) integration
+You can read more about CORL's design and main results in our [technical paper](https://arxiv.org/abs/2210.07105).
+
----
* ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment.
----
+> ⚠️ **NOTE**: CORL (similarly to CleanRL) is not a modular library and therefore is not meant to be imported.
+At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy
+to understand. You should consider using CORL if you want to 1) understand and control all implementation details
+of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support.
+
## Getting started
+Please refer to the [documentation](https://corl-team.github.io/CORL/get-started/install/) for more details. TLDR:
+
```bash
git clone https://github.com/corl-team/CORL.git && cd CORL
pip install -r requirements/requirements_dev.txt
@@ -213,7 +222,7 @@ If you use CORL in your work, please use the following bibtex
```bibtex
@inproceedings{
tarasov2022corl,
- title={{CORL}: Research-oriented Deep Offline Reinforcement Learning Library},
+ title={CORL: Research-oriented Deep Offline Reinforcement Learning Library},
author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov},
booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''},
year={2022},
diff --git a/algorithms/offline/any_percent_bc.py b/algorithms/offline/any_percent_bc.py
index edacc43e..7b6dfa83 100644
--- a/algorithms/offline/any_percent_bc.py
+++ b/algorithms/offline/any_percent_bc.py
@@ -19,26 +19,40 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount factor
- # BC
- buffer_size: int = 2_000_000 # Replay buffer size
- frac: float = 0.1 # Best data fraction to use
- max_traj_len: int = 1000 # Max trajectory length
- normalize: bool = True # Normalize states
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "BC-D4RL"
+ # wandb run name
name: str = "BC"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # training batch size
+ batch_size: int = 256
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # what top fraction of the dataset (sorted by return) to use
+ frac: float = 0.1
+ # maximum possible trajectory length
+ max_traj_len: int = 1000
+ # whether to normalize states
+ normalize: bool = True
+ # discount factor
+ discount: float = 0.99
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/awac.py b/algorithms/offline/awac.py
index 2c652de9..2fb3bf10 100644
--- a/algorithms/offline/awac.py
+++ b/algorithms/offline/awac.py
@@ -20,29 +20,49 @@
@dataclass
class TrainConfig:
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "AWAC-D4RL"
+ # wandb run name
name: str = "AWAC"
- checkpoints_path: Optional[str] = None
-
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-expert-v2"
- seed: int = 42
- test_seed: int = 69
- deterministic_torch: bool = False
- device: str = "cuda"
-
- buffer_size: int = 2_000_000
- num_train_ops: int = 1_000_000
- batch_size: int = 256
- eval_frequency: int = 1000
- n_test_episodes: int = 10
- normalize_reward: bool = False
-
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # actor and critic learning rate
learning_rate: float = 3e-4
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # awac actor loss temperature, controlling balance
+ # between behaviour cloning and Q-value maximization
awac_lambda: float = 1.0
+ # total number of gradient updates during training
+ num_train_ops: int = 1_000_000
+ # training batch size
+ batch_size: int = 256
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # whether to normalize reward (like in IQL)
+ normalize_reward: bool = False
+ # evaluation frequency, will evaluate every eval_frequency
+ # training steps
+ eval_frequency: int = 1000
+ # number of episodes to run during evaluation
+ n_test_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
+ deterministic_torch: bool = False
+ # training random seed
+ seed: int = 42
+ # evaluation random seed
+ test_seed: int = 69
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/cql.py b/algorithms/offline/cql.py
index a1470eb0..307ffc08 100644
--- a/algorithms/offline/cql.py
+++ b/algorithms/offline/cql.py
@@ -23,7 +23,6 @@
@dataclass
class TrainConfig:
- # Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -32,8 +31,6 @@ class TrainConfig:
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
-
- # CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
@@ -59,9 +56,7 @@ class TrainConfig:
q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization
-
- # AntMaze hacks
- bc_steps: int = int(0) # Number of BC steps at start
+ bc_steps: int = int(0) # Number of BC steps at start (AntMaze hacks)
reward_scale: float = 5.0
reward_bias: float = -1.0
policy_log_std_multiplier: float = 1.0
diff --git a/algorithms/offline/dt.py b/algorithms/offline/dt.py
index 37c61e67..367c337e 100644
--- a/algorithms/offline/dt.py
+++ b/algorithms/offline/dt.py
@@ -1,5 +1,5 @@
# inspiration:
-# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa
+# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py
# 2. https://github.com/karpathy/minGPT
import os
import random
@@ -17,44 +17,70 @@
import wandb
from torch.nn import functional as F
from torch.utils.data import DataLoader, IterableDataset
-from tqdm.auto import tqdm, trange # noqa
+from tqdm.auto import trange
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "DT-D4RL"
+ # wandb run name
name: str = "DT"
- # model params
+ # transformer hidden dim
embedding_dim: int = 128
+ # depth of the transformer model
num_layers: int = 3
+ # number of heads in the attention
num_heads: int = 1
+ # maximum sequence length during training
seq_len: int = 20
+ # maximum rollout length, needed for the positional embeddings
episode_len: int = 1000
+ # attention dropout
attention_dropout: float = 0.1
+ # residual dropout
residual_dropout: float = 0.1
+ # embeddings dropout
embedding_dropout: float = 0.1
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # AdamW optimizer learning rate
learning_rate: float = 1e-4
+ # AdamW optimizer betas
betas: Tuple[float, float] = (0.9, 0.999)
+ # AdamW weight decay
weight_decay: float = 1e-4
+ # maximum gradient norm during training, optional
clip_grad: Optional[float] = 0.25
+ # training batch size
batch_size: int = 64
+ # total training steps
update_steps: int = 100_000
+ # warmup steps for the learning rate scheduler
warmup_steps: int = 10_000
+ # reward scaling, to reduce the magnitude
reward_scale: float = 0.001
+ # number of workers for the pytorch dataloader
num_workers: int = 4
- # evaluation params
+ # target return-to-go for prompting during evaluation
target_returns: Tuple[float, ...] = (12000.0, 6000.0)
+ # number of episodes to run during evaluation
eval_episodes: int = 100
+ # evaluation frequency, will evaluate every eval_every training steps
eval_every: int = 10_000
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # training device
device: str = "cuda"
def __post_init__(self):
@@ -180,7 +206,7 @@ def __prepare_sample(self, traj_idx, start_idx):
states = (states - self.state_mean) / self.state_std
returns = returns * self.reward_scale
- # pad up to seq_len if needed
+ # pad up to seq_len if needed, padding is masked during training
mask = np.hstack(
[np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])]
)
diff --git a/algorithms/offline/edac.py b/algorithms/offline/edac.py
index 413801c9..b668e43f 100644
--- a/algorithms/offline/edac.py
+++ b/algorithms/offline/edac.py
@@ -21,36 +21,58 @@
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "EDAC-D4RL"
+ # wandb run name
name: str = "EDAC"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # coefficient for the ensemble diversification loss
eta: float = 1.0
+ # actor learning rate
actor_learning_rate: float = 3e-4
+ # critic learning rate
critic_learning_rate: float = 3e-4
+ # alpha learning rate
alpha_learning_rate: float = 3e-4
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 256
+ # total number of training epochs
num_epochs: int = 3000
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
+ # whether to normalize reward (like in IQL)
normalize_reward: bool = False
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
+ # training device
device: str = "cpu"
def __post_init__(self):
diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py
index 72c28726..14cec70a 100644
--- a/algorithms/offline/iql.py
+++ b/algorithms/offline/iql.py
@@ -29,33 +29,55 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- # IQL
- buffer_size: int = 2_000_000 # Replay buffer size
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount factor
- tau: float = 0.005 # Target network update rate
- beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q
- iql_tau: float = 0.7 # Coefficient for asymmetric loss
- iql_deterministic: bool = False # Use deterministic actor
- normalize: bool = True # Normalize states
- normalize_reward: bool = False # Normalize reward
- vf_lr: float = 3e-4 # V function learning rate
- qf_lr: float = 3e-4 # Critic learning rate
- actor_lr: float = 3e-4 # Actor learning rate
- actor_dropout: Optional[float] = None # Adroit uses dropout for policy network
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "IQL-D4RL"
+ # wandb run name
name: str = "IQL"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # discount factor
+ discount: float = 0.99
+ # coefficient for the target critic Polyak's update
+ tau: float = 0.005
+ # actor update inverse temperature, similar to AWAC
+ # small beta -> BC, big beta -> maximizing Q-value
+ beta: float = 3.0
+ # coefficient for asymmetric critic loss
+ iql_tau: float = 0.7
+ # whether to use deterministic actor
+ iql_deterministic: bool = False
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # training batch size
+ batch_size: int = 256
+ # whether to normalize states
+ normalize: bool = True
+ # whether to normalize reward
+ normalize_reward: bool = False
+ # V-critic function learning rate
+ vf_lr: float = 3e-4
+ # Q-critic learning rate
+ qf_lr: float = 3e-4
+ # actor learning rate
+ actor_lr: float = 3e-4
+ # dropout rate for the policy network, used in the Adroit domain, optional
+ actor_dropout: Optional[float] = None
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/lb_sac.py b/algorithms/offline/lb_sac.py
index 71fb8c54..4f10f77e 100644
--- a/algorithms/offline/lb_sac.py
+++ b/algorithms/offline/lb_sac.py
@@ -23,36 +23,58 @@
# base learning rate: 3e-4
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "LB-SAC"
+ # wandb run name
name: str = "LB-SAC"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # actor learning rate (before scaling was 3e-4)
actor_learning_rate: float = 0.0018
+ # critic learning rate (before scaling was 3e-4)
critic_learning_rate: float = 0.0018
+ # alpha learning rate (before scaling was 3e-4)
alpha_learning_rate: float = 0.0018
+ # whether to use layer normalization for critic
critic_layernorm: bool = False
+ # whether to use initialization from EDAC paper
edac_init: bool = False
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 10_000
+ # total number of training epochs
num_epochs: int = 300
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
+ # training device
device: str = "cpu"
def __post_init__(self):
diff --git a/algorithms/offline/sac_n.py b/algorithms/offline/sac_n.py
index 0b91ddec..a44da091 100644
--- a/algorithms/offline/sac_n.py
+++ b/algorithms/offline/sac_n.py
@@ -21,36 +21,57 @@
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "SAC-N"
+ # wandb run name
name: str = "SAC-N"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # actor learning rate
actor_learning_rate: float = 3e-4
+ # critic learning rate
critic_learning_rate: float = 3e-4
+ # entropy coefficient learning rate for automatic tuning
alpha_learning_rate: float = 3e-4
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 256
+ # total number of training epochs
num_epochs: int = 3000
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
+ # whether to normalize reward (like in IQL)
normalize_reward: bool = False
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
- device: str = "cpu"
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
@@ -465,6 +486,8 @@ def eval_actor(
return np.array(episode_rewards)
+# normalization like in the IQL paper
+# https://github.com/ikostrikov/implicit_q_learning/blob/09d700248117881a75cb21f0adb95c6c8a694cb2/train_offline.py#L35 # noqa
def return_reward_range(dataset, max_episode_steps):
returns, lengths = [], []
ep_ret, ep_len = 0.0, 0
diff --git a/algorithms/offline/td3_bc.py b/algorithms/offline/td3_bc.py
index a78bda30..6ae0379c 100644
--- a/algorithms/offline/td3_bc.py
+++ b/algorithms/offline/td3_bc.py
@@ -22,32 +22,51 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- # TD3
- buffer_size: int = 2_000_000 # Replay buffer size
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount ffor
- expl_noise: float = 0.1 # Std of Gaussian exploration noise
- tau: float = 0.005 # Target network update rate
- policy_noise: float = 0.2 # Noise added to target actor during critic update
- noise_clip: float = 0.5 # Range to clip target actor noise
- policy_freq: int = 2 # Frequency of delayed actor updates
- # TD3 + BC
- alpha: float = 2.5 # Coefficient for Q function in actor loss
- normalize: bool = True # Normalize states
- normalize_reward: bool = False # Normalize reward
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "TD3_BC-D4RL"
+ # wandb run name
name: str = "TD3_BC"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # coefficient for the Q-function in actor loss
+ alpha: float = 2.5
+ # discount factor
+ discount: float = 0.99
+ # standard deviation for the gaussian exploration noise
+ expl_noise: float = 0.1
+ # coefficient for the target critic Polyak's update
+ tau: float = 0.005
+ # scaling coefficient for the noise added to
+ # target actor during critic update
+ policy_noise: float = 0.2
+ # range for the target actor noise clipping
+ noise_clip: float = 0.5
+ # actor update delay
+ policy_freq: int = 2
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # training batch size
+ batch_size: int = 256
+ # whether to normalize states
+ normalize: bool = True
+ # whether to normalize reward (like in IQL)
+ normalize_reward: bool = False
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/docs/algorithms/awac.md b/docs/algorithms/awac.md
new file mode 100644
index 00000000..94731ee0
--- /dev/null
+++ b/docs/algorithms/awac.md
@@ -0,0 +1 @@
+# AWAC
diff --git a/docs/algorithms/bc.md b/docs/algorithms/bc.md
new file mode 100644
index 00000000..7f00d30c
--- /dev/null
+++ b/docs/algorithms/bc.md
@@ -0,0 +1 @@
+# BC
\ No newline at end of file
diff --git a/docs/algorithms/cal-ql.md b/docs/algorithms/cal-ql.md
new file mode 100644
index 00000000..798de1ef
--- /dev/null
+++ b/docs/algorithms/cal-ql.md
@@ -0,0 +1 @@
+# Cal-QL
diff --git a/docs/algorithms/cql.md b/docs/algorithms/cql.md
new file mode 100644
index 00000000..194cc7a1
--- /dev/null
+++ b/docs/algorithms/cql.md
@@ -0,0 +1 @@
+# CQL
\ No newline at end of file
diff --git a/docs/algorithms/dt.md b/docs/algorithms/dt.md
new file mode 100644
index 00000000..e3218c9d
--- /dev/null
+++ b/docs/algorithms/dt.md
@@ -0,0 +1,130 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# DT
+
+## Overview
+
+The Decision Transformer (DT) model casts offline reinforcement learning as a conditional sequence modeling problem.
+
+Unlike prior approaches to offline RL that fit value functions or compute policy gradients, Decision Transformer simply outputs the optimal
+actions by leveraging a causally masked Transformer. By conditioning an autoregressive model on the desired return
+(reward-to-go), past states, and actions, the Decision Transformer can generate future actions that achieve the desired return.
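+
+As a rough illustration of the data layout (not the exact CORL code; all names below are ours), each training context interleaves return-to-go, state and action tokens, and the model is trained to predict the next action under a causal mask:
+
+```python
+import numpy as np
+
+def make_dt_context(states, actions, rewards, seq_len=20):
+    # return-to-go at step t is the sum of rewards from t to the end of the trajectory
+    rtg = np.cumsum(rewards[::-1])[::-1]
+    # one training context: the last seq_len steps of the trajectory
+    r = rtg[-seq_len:, None]    # (seq_len, 1)
+    s = states[-seq_len:]       # (seq_len, state_dim)
+    a = actions[-seq_len:]      # (seq_len, action_dim)
+    # the transformer consumes tokens ordered as (R_1, s_1, a_1, R_2, s_2, a_2, ...)
+    # and predicts a_t from (R_{<=t}, s_{<=t}, a_{<t})
+    return r, s, a
+```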
+
+Original paper:
+
+ * [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345)
+ * [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
+ (similar approach, came out at the same time)
+
+Reference resources:
+
+* :material-github: [Official codebase for Decision Transformer](https://github.com/kzl/decision-transformer)
+
+!!! success
+ Thanks to its simple supervised objective and standard transformer architecture, Decision Transformer is stable and easy to implement, as it
+ has a minimal number of moving parts.
+
+!!! warning
+ Despite its simplicity and stability, DT has a number of drawbacks. It is not capable of stitching suboptimal
+ trajectories (which explains its poor performance on AntMaze datasets), and can also [show](https://arxiv.org/abs/2205.15967) poor performance in stochastic environments.
+
+Possible extensions:
+
+* [Online Decision Transformer](https://arxiv.org/abs/2202.05607)
+* [Emergent Agentic Transformer from Chain of Hindsight Experience](https://arxiv.org/abs/2305.16554)
+* [Q-learning Decision Transformer: Leveraging Dynamic Programming for Conditional Sequence Modelling in Offline RL](https://proceedings.mlr.press/v202/yamagata23a.html)
+
+We'd be glad if someone would be interested in contributing them!
+
+## Implemented Variants
+
+| Variants Implemented | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
+| :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py) <br> :material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/dt) | For continuous action spaces and offline RL without fine-tuning support. |
+
+
+## Explanation of logged metrics
+
+* `eval/{target_return}_return_mean`: mean undiscounted evaluation return when prompted with `config.target_return` value (there might be more than one)
+* `eval/{target_return}_return_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes
+* `eval/{target_return}_normalized_score_mean`: mean normalized score when prompted with `config.target_return` value (there might be more than one).
+ Should be between 0 and 100, where 100+ is the performance above expert for this environment.
+ Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)].
+* `eval/{target_return}_normalized_score_std`: standard deviation of the normalized score return across `config.eval_episodes` episodes
+* `train_loss`: current training loss, mean squared error (MSE) for continuous action spaces
+* `learning_rate`: current learning rate, helps monitor learning rate schedule
+
+## Implementation details
+
+1. Batch sampling weighted by trajectory length, see the sketch after this list (:material-github: [algorithms/offline/dt.py#L171](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L171))
+2. State normalization during training and inference (:material-github: [algorithms/offline/dt.py#L181](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L181))
+3. Reward downscaling (:material-github: [algorithms/offline/dt.py#L182](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L182))
+4. Positional embedding shared across one transition (:material-github: [algorithms/offline/dt.py#L323](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L323))
+5. Prompting with multiple return-to-go's during evaluation, as DT can be sensitive to the prompt (:material-github: [algorithms/offline/dt.py#L498](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L498))
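+
+A minimal sketch of the length-weighted sampling from item 1, assuming `traj_lens` holds the lengths of all trajectories in the dataset (names are ours, not the exact CORL code):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+traj_lens = np.array([1000, 180, 460])       # example trajectory lengths
+sample_prob = traj_lens / traj_lens.sum()    # longer trajectories are sampled proportionally more often
+
+# pick a trajectory and a start offset for one training sequence
+traj_idx = rng.choice(len(traj_lens), p=sample_prob)
+start_idx = rng.integers(0, traj_lens[traj_idx])
+```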
+
+## Experimental results
+
+For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md).
+Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly.
+
+
+
+## Training options
+
+```commandline
+usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int]
+ [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float]
+ [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float]
+ [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int]
+ [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]]
+ [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool]
+ [--train_seed int] [--eval_seed int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: DT-D4RL)
+ --name str wandb run name (default: DT)
+ --embedding_dim int transformer hidden dim (default: 128)
+ --num_layers int depth of the transformer model (default: 3)
+ --num_heads int number of heads in the attention (default: 1)
+ --seq_len int maximum sequence length during training (default: 20)
+ --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000)
+ --attention_dropout float
+ attention dropout (default: 0.1)
+ --residual_dropout float
+ residual dropout (default: 0.1)
+ --embedding_dropout float
+ embeddings dropout (default: 0.1)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --learning_rate float
+ AdamW optimizer learning rate (default: 0.0001)
+ --betas float float AdamW optimizer betas (default: (0.9, 0.999))
+ --weight_decay float AdamW weight decay (default: 0.0001)
+ --clip_grad [float] maximum gradient norm during training, optional (default: 0.25)
+ --batch_size int training batch size (default: 64)
+ --update_steps int total training steps (default: 100000)
+ --warmup_steps int warmup steps for the learning rate scheduler (default: 10000)
+ --reward_scale float reward scaling, to reduce the magnitude (default: 0.001)
+ --num_workers int number of workers for the pytorch dataloader (default: 4)
+ --target_returns float [float, ...]
+ target return-to-go for prompting during evaluation (default: (12000.0, 6000.0))
+ --eval_episodes int number of episodes to run during evaluation (default: 100)
+ --eval_every int evaluation frequency, will evaluate every eval_every training steps (default: 10000)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --device str training device (default: cuda)
+```
+
diff --git a/docs/algorithms/edac.md b/docs/algorithms/edac.md
new file mode 100644
index 00000000..68c4a050
--- /dev/null
+++ b/docs/algorithms/edac.md
@@ -0,0 +1 @@
+# EDAC
diff --git a/docs/algorithms/iql.md b/docs/algorithms/iql.md
new file mode 100644
index 00000000..d619a8c4
--- /dev/null
+++ b/docs/algorithms/iql.md
@@ -0,0 +1 @@
+# IQL
\ No newline at end of file
diff --git a/docs/algorithms/lb-sac.md b/docs/algorithms/lb-sac.md
new file mode 100644
index 00000000..4498896c
--- /dev/null
+++ b/docs/algorithms/lb-sac.md
@@ -0,0 +1 @@
+# LB-SAC
diff --git a/docs/algorithms/rebrac.md b/docs/algorithms/rebrac.md
new file mode 100644
index 00000000..6978645c
--- /dev/null
+++ b/docs/algorithms/rebrac.md
@@ -0,0 +1 @@
+# ReBRAC
\ No newline at end of file
diff --git a/docs/algorithms/sac-n.md b/docs/algorithms/sac-n.md
new file mode 100644
index 00000000..8c05636a
--- /dev/null
+++ b/docs/algorithms/sac-n.md
@@ -0,0 +1,153 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# SAC-N
+
+## Overview
+
+SAC-N is a simple extension of the well-known online Soft Actor-Critic (SAC) algorithm. For an overview of online SAC,
+see the excellent [documentation at **CleanRL**](https://docs.cleanrl.dev/rl-algorithms/sac/). SAC utilizes a conventional
+technique from online RL, Clipped Double Q-learning, which uses the minimum value of two parallel Q-networks
+as the Bellman target. SAC-N modifies SAC by increasing the size of the Q-ensemble from $2$ to $N$ to prevent overestimation.
+That's it!
+
+
+Critic loss (change in blue):
+
+$$
+\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right]
+$$
+
+Actor loss (change in blue):
+
+$$
+\max _\theta \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \pi_\theta(\cdot \mid \mathbf{s})}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j}(\mathbf{s}, \mathbf{a})-\alpha \log \pi_\theta(\mathbf{a} \mid \mathbf{s})\right]
+$$
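+
+In code, the only change relative to SAC is that the Bellman target takes the minimum over all $N$ critics instead of two. A PyTorch-style sketch (the `actor`/`target_critic` interfaces and tensor shapes are assumptions, not the exact CORL implementation):
+
+```python
+import torch
+
+@torch.no_grad()
+def critic_target(actor, target_critic, alpha, reward, next_state, done, gamma=0.99):
+    # sample a' ~ pi(.|s') and evaluate the whole target ensemble on it
+    next_action, next_log_prob = actor(next_state)
+    q_next = target_critic(next_state, next_action)             # [num_critics, batch]
+    q_next = q_next.min(dim=0).values - alpha * next_log_prob   # min over the N critics
+    return reward + gamma * (1.0 - done) * q_next
+```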
+
+Why does it work? There is a simple intuition given in the original paper. The clipped Q-learning algorithm, which uses the
+worst-case Q-value to compute a pessimistic estimate, can also be interpreted as utilizing the lower confidence bound (LCB) of the Q-value
+predictions. Suppose $Q(s, a)$ follows a Gaussian distribution with mean $m(s, a)$ and standard deviation $\sigma(s, a)$. Also,
+let $\left\{Q_j(\mathbf{s}, \mathbf{a})\right\}_{j=1}^N$ be realizations of $Q(s, a)$. Then, we can approximate the expected minimum of the realizations as
+
+$$
+\mathbb{E}\left[\min _{j=1, \ldots, N} Q_j(\mathbf{s}, \mathbf{a})\right] \approx m(\mathbf{s}, \mathbf{a})-\Phi^{-1}\left(\frac{N-\frac{\pi}{8}}{N-\frac{\pi}{4}+1}\right) \sigma(\mathbf{s}, \mathbf{a})
+$$
+
+where $\Phi$ is the CDF of the standard Gaussian distribution. This relation indicates that using the clipped Q-value
+is similar to penalizing the ensemble mean of the Q-values with the standard deviation scaled by a coefficient dependent on $N$.
+For OOD actions, the standard deviation will be higher, and thus the penalty will be stronger, preventing divergence.
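+
+To get a feel for how this penalty grows with the ensemble size, the coefficient $\Phi^{-1}\left(\frac{N-\pi/8}{N-\pi/4+1}\right)$ can be evaluated directly (a small illustration using `scipy`, not part of the CORL code):
+
+```python
+import numpy as np
+from scipy.stats import norm
+
+def lcb_coef(n):
+    # multiplier on the Q-value standard deviation implied by taking the min over n critics
+    return norm.ppf((n - np.pi / 8) / (n - np.pi / 4 + 1))
+
+for n in [2, 10, 25, 100]:
+    print(n, round(lcb_coef(n), 2))  # the penalty grows (slowly) with n
+```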
+
+Original paper:
+
+* [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble](https://arxiv.org/abs/2110.01548)
+
+Reference resources:
+
+* :material-github: [Official codebase for SAC-N and EDAC](https://github.com/snu-mllab/EDAC)
+
+
+!!! success
+ SAC-N is an extremely simple extension of online SAC and works quite well out of the box on the majority of benchmarks.
+ Usually only one parameter needs tuning: the size of the critic ensemble. It has SOTA results on the D4RL-Mujoco domain.
+
+!!! warning
+ Typically, SAC-N requires more time to converge, 3M updates instead of the usual 1M. Also, more complex tasks
+ may require a larger ensemble size, which will considerably increase training time. Finally,
+ SAC-N mysteriously does not work on the AntMaze domain. If you know how to fix this, let us know, it would be awesome!
+
+
+Possible extensions:
+
+* [Anti-Exploration by Random Network Distillation](https://arxiv.org/abs/2301.13616)
+* [Why So Pessimistic? Estimating Uncertainties for Offline RL through Ensembles, and Why Their Independence Matters](https://arxiv.org/abs/2205.13703)
+
+We'd be glad if someone would be interested in contributing them!
+
+## Implemented Variants
+
+| Variants Implemented | Description |
+|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
+| :material-github: [`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py) <br> :material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/sac_n) | For continuous action spaces and offline RL without fine-tuning support. |
+
+
+## Explanation of logged metrics
+
+* `critic_loss`: sum of the Q-ensemble individual mean losses (for loss definition see above)
+* `actor_loss`: mean actor loss (for loss definition see above)
+* `alpha_loss`: entropy regularization coefficient loss for automatic policy entropy tuning (see **CleanRL** docs for more details)
+* `batch_entropy`: estimation of the policy distribution entropy based on the batch states
+* `alpha`: coefficient for entropy regularization of the policy
+* `q_policy_std`: standard deviation of the Q-ensemble on a batch of states and policy actions
+* `q_random_std`: standard deviation of the Q-ensemble on a batch of states and random (OOD) actions
+* `eval/reward_mean`: mean undiscounted evaluation return
+* `eval/reward_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes
+* `eval/normalized_score_mean`: mean evaluation normalized score. Should be between 0 and 100, where 100+ is the
+ performance above expert for this environment. Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)].
+* `eval/normalized_score_std`: standard deviation of the evaluation normalized score across `config.eval_episodes` episodes
+
+## Implementation details
+
+1. Efficient ensemble implementation with vectorized linear layers, see the sketch after this list (:material-github:[algorithms/offline/sac_n.py#L174](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L174))
+2. Actor last layer initialization with small values (:material-github:[algorithms/offline/sac_n.py#L223](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L223))
+3. Critic last layer initialization with small values (but bigger than in actor) (:material-github:[algorithms/offline/sac_n.py#L283](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L283))
+4. Clipping bounds for actor `log_std` are different from the original online SAC (:material-github:[algorithms/offline/sac_n.py#L241](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L241))
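+
+The sketch below illustrates the idea behind item 1: all $N$ critics share one batched matmul instead of a Python loop over ensemble members (an illustration with our own naming, not the exact CORL module):
+
+```python
+import torch
+import torch.nn as nn
+
+class VectorizedLinear(nn.Module):
+    # num_models independent linear layers applied in a single batched matmul
+    def __init__(self, in_features, out_features, num_models):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(num_models, in_features, out_features) * 0.01)
+        self.bias = nn.Parameter(torch.zeros(num_models, 1, out_features))
+
+    def forward(self, x):
+        # x: [num_models, batch, in_features] -> [num_models, batch, out_features]
+        return x @ self.weight + self.bias
+```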
+
+## Experimental results
+
+For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md).
+Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly.
+
+
+
+## Training options
+
+```commandline
+usage: sac_n.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--hidden_dim int] [--num_critics int]
+ [--gamma float] [--tau float] [--actor_learning_rate float] [--critic_learning_rate float]
+ [--alpha_learning_rate float] [--max_action float] [--buffer_size int] [--env_name str] [--batch_size int]
+ [--num_epochs int] [--num_updates_on_epoch int] [--normalize_reward bool] [--eval_episodes int]
+ [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool] [--train_seed int]
+ [--eval_seed int] [--log_every int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: SAC-N)
+ --name str wandb run name (default: SAC-N)
+ --hidden_dim int actor and critic hidden dim (default: 256)
+ --num_critics int critic ensemble size (default: 10)
+ --gamma float discount factor (default: 0.99)
+ --tau float coefficient for the target critic Polyak's update (default: 0.005)
+ --actor_learning_rate float
+ actor learning rate (default: 0.0003)
+ --critic_learning_rate float
+ critic learning rate (default: 0.0003)
+ --alpha_learning_rate float
+ entropy coefficient learning rate for automatic tuning (default: 0.0003)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --buffer_size int maximum size of the replay buffer (default: 1000000)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --batch_size int training batch size (default: 256)
+ --num_epochs int total number of training epochs (default: 3000)
+ --num_updates_on_epoch int
+ number of gradient updates during one epoch (default: 1000)
+ --normalize_reward bool
+ whether to normalize reward (like in IQL) (default: False)
+ --eval_episodes int number of episodes to run during evaluation (default: 10)
+ --eval_every int evaluation frequency, will evaluate every eval_every epochs (default: 5)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --log_every int frequency of metrics logging to wandb (default: 100)
+ --device str training device (default: cuda)
+```
+
diff --git a/docs/algorithms/spot.md b/docs/algorithms/spot.md
new file mode 100644
index 00000000..bc991a94
--- /dev/null
+++ b/docs/algorithms/spot.md
@@ -0,0 +1 @@
+# SPOT
\ No newline at end of file
diff --git a/docs/algorithms/td3-bc.md b/docs/algorithms/td3-bc.md
new file mode 100644
index 00000000..8199ab8f
--- /dev/null
+++ b/docs/algorithms/td3-bc.md
@@ -0,0 +1 @@
+# TD3+BC
\ No newline at end of file
diff --git a/docs/assets/corl.pdf b/docs/assets/corl.pdf
new file mode 100644
index 00000000..d4eb872c
Binary files /dev/null and b/docs/assets/corl.pdf differ
diff --git a/docs/assets/logo.jpeg b/docs/assets/logo.jpeg
new file mode 100644
index 00000000..6fc0a734
Binary files /dev/null and b/docs/assets/logo.jpeg differ
diff --git a/docs/assets/perf_profiles_offline.pdf b/docs/assets/perf_profiles_offline.pdf
new file mode 100644
index 00000000..33c2cb0e
Binary files /dev/null and b/docs/assets/perf_profiles_offline.pdf differ
diff --git a/docs/assets/perf_profiles_online.pdf b/docs/assets/perf_profiles_online.pdf
new file mode 100644
index 00000000..c59684ac
Binary files /dev/null and b/docs/assets/perf_profiles_online.pdf differ
diff --git a/docs/benchmarks/offline-to-online.md b/docs/benchmarks/offline-to-online.md
new file mode 100644
index 00000000..32f9e4d4
--- /dev/null
+++ b/docs/benchmarks/offline-to-online.md
@@ -0,0 +1,60 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# Offline-to-online
+
+ Here, we report reproduced scores after offline pretraining and online fine-tuning for all datasets and offline-to-online algorithms considered.
+
+!!! tip
+ If you want to re-collect our results in a more structured/nuanced manner, see the [how to reproduce](repro.md) section.
+
+## Scores
+
+### Antmaze
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|antmaze-umaze-v2|52.75 ± 8.67 → 98.75 ± 1.09|94.00 ± 1.58 → 99.50 ± 0.87|77.00 ± 0.71 → 96.50 ± 1.12|91.00 ± 2.55 → 99.50 ± 0.50|76.75 ± 7.53 → 99.75 ± 0.43|
+|antmaze-umaze-diverse-v2|56.00 ± 2.74 → 0.00 ± 0.00|9.50 ± 9.91 → 99.00 ± 1.22|59.50 ± 9.55 → 63.75 ± 25.02|36.25 ± 2.17 → 95.00 ± 3.67|32.00 ± 27.79 → 98.50 ± 1.12|
+|antmaze-medium-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|59.00 ± 11.18 → 97.75 ± 1.30|71.75 ± 2.95 → 89.75 ± 1.09|67.25 ± 10.47 → 97.25 ± 1.30|71.75 ± 3.27 → 98.75 ± 1.64|
+|antmaze-medium-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|63.50 ± 6.84 → 97.25 ± 1.92|64.25 ± 1.92 → 92.25 ± 2.86|73.75 ± 7.29 → 94.50 ± 1.66|62.00 ± 4.30 → 98.25 ± 1.48|
+|antmaze-large-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|28.75 ± 7.76 → 88.25 ± 2.28|38.50 ± 8.73 → 64.50 ± 17.04|31.50 ± 12.58 → 87.00 ± 3.24|31.75 ± 8.87 → 97.25 ± 1.79|
+|antmaze-large-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|35.50 ± 3.64 → 91.75 ± 3.96|26.75 ± 3.77 → 64.25 ± 4.15|17.50 ± 7.26 → 81.00 ± 14.14|44.00 ± 8.69 → 91.50 ± 3.91|
+| **average** |18.12 → 16.46|48.38 → 95.58|56.29 → 78.50|52.88 → 92.38|53.04 → 97.33|
+
+### Adroit
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|pen-cloned-v1|88.66 ± 15.10 → 86.82 ± 11.12|-2.76 ± 0.08 → -1.28 ± 2.16|84.19 ± 3.96 → 102.02 ± 20.75|6.19 ± 5.21 → 43.63 ± 20.09|-2.66 ± 0.04 → -2.68 ± 0.12|
+|door-cloned-v1|0.93 ± 1.66 → 0.01 ± 0.00|-0.33 ± 0.01 → -0.33 ± 0.01|1.19 ± 0.93 → 20.34 ± 9.32|-0.21 ± 0.14 → 0.02 ± 0.31|-0.33 ± 0.01 → -0.33 ± 0.01|
+|hammer-cloned-v1|1.80 ± 3.01 → 0.24 ± 0.04|0.56 ± 0.55 → 2.85 ± 4.81|1.35 ± 0.32 → 57.27 ± 28.49|3.97 ± 6.39 → 3.73 ± 4.99|0.25 ± 0.04 → 0.17 ± 0.17|
+|relocate-cloned-v1|-0.04 ± 0.04 → -0.04 ± 0.01|-0.33 ± 0.01 → -0.33 ± 0.01|0.04 ± 0.04 → 0.32 ± 0.38|-0.24 ± 0.01 → -0.15 ± 0.05|-0.31 ± 0.05 → -0.31 ± 0.04|
+| **average** |22.84 → 21.76|-0.72 → 0.22|21.69 → 44.99|2.43 → 11.81|-0.76 → -0.79|
+
+## Regrets
+
+### Antmaze
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|antmaze-umaze-v2|0.04 ± 0.01|0.02 ± 0.00|0.07 ± 0.00|0.02 ± 0.00|0.01 ± 0.00|
+|antmaze-umaze-diverse-v2|0.88 ± 0.01|0.09 ± 0.01|0.43 ± 0.11|0.22 ± 0.07|0.05 ± 0.01|
+|antmaze-medium-play-v2|1.00 ± 0.00|0.08 ± 0.01|0.09 ± 0.01|0.06 ± 0.00|0.04 ± 0.01|
+|antmaze-medium-diverse-v2|1.00 ± 0.00|0.08 ± 0.00|0.10 ± 0.01|0.05 ± 0.01|0.04 ± 0.01|
+|antmaze-large-play-v2|1.00 ± 0.00|0.21 ± 0.02|0.34 ± 0.05|0.29 ± 0.07|0.13 ± 0.02|
+|antmaze-large-diverse-v2|1.00 ± 0.00|0.21 ± 0.03|0.41 ± 0.03|0.23 ± 0.08|0.13 ± 0.02|
+| **average** |0.82|0.11|0.24|0.15|0.07|
+
+### Adroit
+
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|pen-cloned-v1|0.46 ± 0.02|0.97 ± 0.00|0.37 ± 0.01|0.58 ± 0.02|0.98 ± 0.01|
+|door-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.83 ± 0.03|0.99 ± 0.01|1.00 ± 0.00|
+|hammer-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.65 ± 0.10|0.98 ± 0.01|1.00 ± 0.00|
+|relocate-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|
+| **average** |0.86|0.99|0.71|0.89|0.99|
+
+## Visual summary
+
+![](../assets/perf_profiles_online.pdf)
\ No newline at end of file
diff --git a/docs/benchmarks/offline.md b/docs/benchmarks/offline.md
new file mode 100644
index 00000000..f83e0746
--- /dev/null
+++ b/docs/benchmarks/offline.md
@@ -0,0 +1,122 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# Offline
+
+ Here, we report reproduced **final** and **best** scores for all datasets and offline algorithms considered. Note that they differ by a significant
+ margin, and some papers may use different approaches, without always making it explicit which reporting methodology they chose.
+
+!!! tip
+ If you want to re-collect our results in a more structured/nuanced manner, see the [how to reproduce](repro.md) section.
+
+## Last Scores
+### Gym-MuJoCo
+
+| **Task-Name**|BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT|
+|------------------------------|------------|--------|--------|--------|-----|-----|------|-------|----------------|----|
+|halfcheetah-medium-v2|42.40 ± 0.19|42.46 ± 0.70|48.10 ± 0.18|50.02 ± 0.27|47.04 ± 0.22|48.31 ± 0.22|64.04 ± 0.68|68.20 ± 1.28| 67.70 ± 1.04 |42.20 ± 0.26|
+|halfcheetah-medium-replay-v2|35.66 ± 2.33|23.59 ± 6.95|44.84 ± 0.59|45.13 ± 0.88|45.04 ± 0.27|44.46 ± 0.22|51.18 ± 0.31|60.70 ± 1.01| 62.06 ± 1.10 |38.91 ± 0.50|
+|halfcheetah-medium-expert-v2|55.95 ± 7.35|90.10 ± 2.45|90.78 ± 6.04|95.00 ± 0.61|95.63 ± 0.42|94.74 ± 0.52|103.80 ± 2.95|98.96 ± 9.31| 104.76 ± 0.64 |91.55 ± 0.95|
+|hopper-medium-v2|53.51 ± 1.76|55.48 ± 7.30|60.37 ± 3.49|63.02 ± 4.56|59.08 ± 3.77|67.53 ± 3.78|102.29 ± 0.17|40.82 ± 9.91| 101.70 ± 0.28 |65.10 ± 1.61|
+|hopper-medium-replay-v2|29.81 ± 2.07|70.42 ± 8.66|64.42 ± 21.52|98.88 ± 2.07|95.11 ± 5.27|97.43 ± 6.39|94.98 ± 6.53|100.33 ± 0.78| 99.66 ± 0.81 |81.77 ± 6.87|
+|hopper-medium-expert-v2|52.30 ± 4.01|111.16 ± 1.03|101.17 ± 9.07|101.90 ± 6.22|99.26 ± 10.91|107.42 ± 7.80|109.45 ± 2.34|101.31 ± 11.63| 105.19 ± 10.08 |110.44 ± 0.33|
+|walker2d-medium-v2|63.23 ± 16.24|67.34 ± 5.17|82.71 ± 4.78|68.52 ± 27.19|80.75 ± 3.28|80.91 ± 3.17|85.82 ± 0.77|87.47 ± 0.66| 93.36 ± 1.38 |67.63 ± 2.54|
+|walker2d-medium-replay-v2|21.80 ± 10.15|54.35 ± 6.34|85.62 ± 4.01|80.62 ± 3.58|73.09 ± 13.22|82.15 ± 3.03|84.25 ± 2.25|78.99 ± 0.50| 87.10 ± 2.78 |59.86 ± 2.73|
+|walker2d-medium-expert-v2|98.96 ± 15.98|108.70 ± 0.25|110.03 ± 0.36|111.44 ± 1.62|109.56 ± 0.39|111.72 ± 0.86|111.86 ± 0.43|114.93 ± 0.41| 114.75 ± 0.74 |107.11 ± 0.96|
+| **locomotion average** |50.40|69.29|76.45|79.39|78.28|81.63|89.74|83.52| **92.92** |73.84|
+
+### Maze2d
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC| SAC-N |EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|------|----------------|------|----|
+|maze2d-umaze-v1|0.36 ± 8.69|12.18 ± 4.29|29.41 ± 12.31|65.65 ± 5.34|-8.90 ± 6.11|42.11 ± 0.58|106.87 ± 22.16| 130.59 ± 16.52 |95.26 ± 6.39|18.08 ± 25.42|
+|maze2d-medium-v1|0.79 ± 3.25|14.25 ± 2.33|59.45 ± 36.25|84.63 ± 35.54|86.11 ± 9.68|34.85 ± 2.72|105.11 ± 31.67| 88.61 ± 18.72 |57.04 ± 3.45|31.71 ± 26.33|
+|maze2d-large-v1|2.26 ± 4.39|11.32 ± 5.10|97.10 ± 25.41|215.50 ± 3.11|23.75 ± 36.70|61.72 ± 3.50|78.33 ± 61.77| 204.76 ± 1.19 |95.60 ± 22.92|35.66 ± 28.20|
+| **maze2d average** |1.13|12.58|61.99|121.92|33.65|46.23|96.77| **141.32** |82.64|28.48|
+
+### Antmaze
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|antmaze-umaze-v2|55.25 ± 4.15|65.75 ± 5.26|70.75 ± 39.18|56.75 ± 9.09|92.75 ± 1.92|77.00 ± 5.52| 97.75 ± 1.48 |0.00 ± 0.00|0.00 ± 0.00|57.00 ± 9.82|
+|antmaze-umaze-diverse-v2|47.25 ± 4.09|44.00 ± 1.00|44.75 ± 11.61|54.75 ± 8.01|37.25 ± 3.70|54.25 ± 5.54| 83.50 ± 7.02 |0.00 ± 0.00|0.00 ± 0.00|51.75 ± 0.43|
+|antmaze-medium-play-v2|0.00 ± 0.00|2.00 ± 0.71|0.25 ± 0.43|0.00 ± 0.00|65.75 ± 11.61|65.75 ± 11.71| 89.50 ± 3.35 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-medium-diverse-v2|0.75 ± 0.83|5.75 ± 9.39|0.25 ± 0.43|0.00 ± 0.00|67.25 ± 3.56|73.75 ± 5.45| 83.50 ± 8.20 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-play-v2|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|20.75 ± 7.26|42.00 ± 4.53| 52.25 ± 29.01 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-diverse-v2|0.00 ± 0.00|0.75 ± 0.83|0.00 ± 0.00|0.00 ± 0.00|20.50 ± 13.24|30.25 ± 3.63| 64.00 ± 5.43 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+| **antmaze average** | 17.21|19.71|19.33|18.58|50.71|57.17| **78.42** |0.00|0.00|18.12|
+
+### Adroit
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|pen-human-v1|71.03 ± 6.26|26.99 ± 9.60|-3.88 ± 0.21|76.65 ± 11.71|13.71 ± 16.98|78.49 ± 8.21| 103.16 ± 8.49 |6.86 ± 5.93|5.07 ± 6.16|67.68 ± 5.48|
+|pen-cloned-v1|51.92 ± 15.15|46.67 ± 14.25|5.13 ± 5.28|85.72 ± 16.92|1.04 ± 6.62|83.42 ± 8.19| 102.79 ± 7.84 |31.35 ± 2.14|12.02 ± 1.75|64.43 ± 1.43|
+|pen-expert-v1|109.65 ± 7.28|114.96 ± 2.96|122.53 ± 21.27|159.91 ± 1.87|-1.41 ± 2.34|128.05 ± 9.21| 152.16 ± 6.33 |87.11 ± 48.95|-1.55 ± 0.81|116.38 ± 1.27|
+|door-human-v1|2.34 ± 4.00|-0.13 ± 0.07|-0.33 ± 0.01|2.39 ± 2.26|5.53 ± 1.31|3.26 ± 1.83| -0.10 ± 0.01 |-0.38 ± 0.00|-0.12 ± 0.13|4.44 ± 0.87|
+|door-cloned-v1|-0.09 ± 0.03|0.29 ± 0.59|-0.34 ± 0.01|-0.01 ± 0.01|-0.33 ± 0.01|3.07 ± 1.75| 0.06 ± 0.05 |-0.33 ± 0.00|2.66 ± 2.31|7.64 ± 3.26|
+|door-expert-v1|105.35 ± 0.09|104.04 ± 1.46|-0.33 ± 0.01|104.57 ± 0.31|-0.32 ± 0.02|106.65 ± 0.25| 106.37 ± 0.29 |-0.33 ± 0.00|106.29 ± 1.73|104.87 ± 0.39|
+|hammer-human-v1|3.03 ± 3.39|-0.19 ± 0.02|1.02 ± 0.24|1.01 ± 0.51|0.14 ± 0.11|1.79 ± 0.80| 0.24 ± 0.24 |0.24 ± 0.00|0.28 ± 0.18|1.28 ± 0.15|
+|hammer-cloned-v1|0.55 ± 0.16|0.12 ± 0.08|0.25 ± 0.01|1.27 ± 2.11|0.30 ± 0.01|1.50 ± 0.69| 5.00 ± 3.75 |0.14 ± 0.09|0.19 ± 0.07|1.82 ± 0.55|
+|hammer-expert-v1|126.78 ± 0.64|121.75 ± 7.67|3.11 ± 0.03|127.08 ± 0.13|0.26 ± 0.01|128.68 ± 0.33| 133.62 ± 0.27 |25.13 ± 43.25|28.52 ± 49.00|117.45 ± 6.65|
+|relocate-human-v1|0.04 ± 0.03|-0.14 ± 0.08|-0.29 ± 0.01|0.45 ± 0.53|0.06 ± 0.03|0.12 ± 0.04| 0.16 ± 0.30 |-0.31 ± 0.01|-0.17 ± 0.17|0.05 ± 0.01|
+|relocate-cloned-v1|-0.06 ± 0.01|-0.00 ± 0.02|-0.30 ± 0.01|-0.01 ± 0.03|-0.29 ± 0.01|0.04 ± 0.01| 1.66 ± 2.59 |-0.01 ± 0.10|0.17 ± 0.35|0.16 ± 0.09|
+|relocate-expert-v1|107.58 ± 1.20|97.90 ± 5.21|-1.73 ± 0.96|109.52 ± 0.47|-0.30 ± 0.02|106.11 ± 4.02| 107.52 ± 2.28 |-0.36 ± 0.00|71.94 ± 18.37|104.28 ± 0.42|
+| | | | | | | | | | | |
+| **adroit average** | 48.18|42.69|10.40|55.71|1.53|53.43| **59.39** |12.43|18.78|49.21|
+
+## Best Scores
+### Gym-MuJoCo
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT|
+|--------------------|------------|--------|--------|--------|-----|-----|------|-------|---------------|----|
+|halfcheetah-medium-v2|43.60 ± 0.14|43.90 ± 0.13|48.93 ± 0.11|50.81 ± 0.15|47.62 ± 0.03|48.84 ± 0.07|65.62 ± 0.46|72.21 ± 0.31| 69.72 ± 0.92 |42.73 ± 0.10|
+|halfcheetah-medium-replay-v2|40.52 ± 0.19|42.27 ± 0.46|45.84 ± 0.26|46.47 ± 0.26|46.43 ± 0.19|45.35 ± 0.08|52.22 ± 0.31|67.29 ± 0.34| 66.55 ± 1.05 |40.31 ± 0.28|
+|halfcheetah-medium-expert-v2|79.69 ± 3.10|94.11 ± 0.22|96.59 ± 0.87|96.83 ± 0.23|97.04 ± 0.17|95.38 ± 0.17|108.89 ± 1.20|111.73 ± 0.47| 110.62 ± 1.04 |93.40 ± 0.21|
+|hopper-medium-v2|69.04 ± 2.90|73.84 ± 0.37|70.44 ± 1.18|95.42 ± 3.67|70.80 ± 1.98|80.46 ± 3.09|103.19 ± 0.16|101.79 ± 0.20| 103.26 ± 0.14 |69.42 ± 3.64|
+|hopper-medium-replay-v2|68.88 ± 10.33|90.57 ± 2.07|98.12 ± 1.16|101.47 ± 0.23|101.63 ± 0.55|102.69 ± 0.96|102.57 ± 0.45|103.83 ± 0.53| 103.28 ± 0.49 |88.74 ± 3.02|
+|hopper-medium-expert-v2|90.63 ± 10.98|113.13 ± 0.16|113.22 ± 0.43|113.26 ± 0.49|112.84 ± 0.66|113.18 ± 0.38|113.16 ± 0.43|111.24 ± 0.15| 111.80 ± 0.11 |111.18 ± 0.21|
+|walker2d-medium-v2|80.64 ± 0.91|82.05 ± 0.93|86.91 ± 0.28|85.86 ± 3.76|84.77 ± 0.20|87.58 ± 0.48|87.79 ± 0.19|90.17 ± 0.54| 95.78 ± 1.07 |74.70 ± 0.56|
+|walker2d-medium-replay-v2|48.41 ± 7.61|76.09 ± 0.40|91.17 ± 0.72|86.70 ± 0.94|89.39 ± 0.88|89.94 ± 0.93|91.11 ± 0.63|85.18 ± 1.63| 89.69 ± 1.39 |68.22 ± 1.20|
+|walker2d-medium-expert-v2|109.95 ± 0.62|109.90 ± 0.09|112.21 ± 0.06|113.40 ± 2.22|111.63 ± 0.38|113.06 ± 0.53|112.49 ± 0.18|116.93 ± 0.42| 116.52 ± 0.75 |108.71 ± 0.34|
+| **locomotion average** | 70.15|80.65|84.83|87.80|84.68|86.28|93.00|95.60| **96.36** |77.49|
+
+
+### Maze2d
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|maze2d-umaze-v1|16.09 ± 0.87|22.49 ± 1.52|99.33 ± 16.16|136.96 ± 10.89|92.05 ± 13.66|50.92 ± 4.23| 162.28 ± 1.79 |153.12 ± 6.49|149.88 ± 1.97|63.83 ± 17.35|
+|maze2d-medium-v1|19.16 ± 1.24|27.64 ± 1.87|150.93 ± 3.89|152.73 ± 20.78|128.66 ± 5.44|122.69 ± 30.00| 150.12 ± 4.48 |93.80 ± 14.66|154.41 ± 1.58|68.14 ± 12.25|
+|maze2d-large-v1|20.75 ± 6.66|41.83 ± 3.64|197.64 ± 5.26|227.31 ± 1.47|157.51 ± 7.32|162.25 ± 44.18| 197.55 ± 5.82 |207.51 ± 0.96|182.52 ± 2.68|50.25 ± 19.34|
+| | | | | | | | | | | |
+| **maze2d average** | 18.67|30.65|149.30|172.33|126.07|111.95| **169.98** |151.48|162.27|60.74|
+
+### Antmaze
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|antmaze-umaze-v2|68.50 ± 2.29|77.50 ± 1.50|98.50 ± 0.87|70.75 ± 8.84|94.75 ± 0.83|84.00 ± 4.06| 100.00 ± 0.00 |0.00 ± 0.00|42.50 ± 28.61|64.50 ± 2.06|
+|antmaze-umaze-diverse-v2|64.75 ± 4.32|63.50 ± 2.18|71.25 ± 5.76|81.50 ± 4.27|53.75 ± 2.05|79.50 ± 3.35| 96.75 ± 2.28 |0.00 ± 0.00|0.00 ± 0.00|60.50 ± 2.29|
+|antmaze-medium-play-v2|4.50 ± 1.12|6.25 ± 2.38|3.75 ± 1.30|25.00 ± 10.70|80.50 ± 3.35|78.50 ± 3.84| 93.50 ± 2.60 |0.00 ± 0.00|0.00 ± 0.00|0.75 ± 0.43|
+|antmaze-medium-diverse-v2|4.75 ± 1.09|16.50 ± 5.59|5.50 ± 1.50|10.75 ± 5.31|71.00 ± 4.53|83.50 ± 1.80| 91.75 ± 2.05 |0.00 ± 0.00|0.00 ± 0.00|0.50 ± 0.50|
+|antmaze-large-play-v2|0.50 ± 0.50|13.50 ± 9.76|1.25 ± 0.43|0.50 ± 0.50|34.75 ± 5.85|53.50 ± 2.50| 68.75 ± 13.90 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-diverse-v2|0.75 ± 0.43|6.25 ± 1.79|0.25 ± 0.43|0.00 ± 0.00|36.25 ± 3.34|53.00 ± 3.00| 69.50 ± 7.26 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+| **antmaze average** |23.96|30.58|30.08|31.42|61.83|72.00| **86.71** |0.00|7.08|21.04|
+
+### Adroit
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|pen-human-v1|99.69 ± 7.45|59.89 ± 8.03|9.95 ± 8.19|119.03 ± 6.55|58.91 ± 1.81|106.15 ± 10.28| 127.28 ± 3.22 |56.48 ± 7.17|35.84 ± 10.57|77.83 ± 2.30|
+|pen-cloned-v1|99.14 ± 12.27|83.62 ± 11.75|52.66 ± 6.33|125.78 ± 3.28|14.74 ± 2.31|114.05 ± 4.78| 128.64 ± 7.15 |52.69 ± 5.30|26.90 ± 7.85|71.17 ± 2.70|
+|pen-expert-v1|128.77 ± 5.88|134.36 ± 3.16|142.83 ± 7.72|162.53 ± 0.30|14.86 ± 4.07|140.01 ± 6.36| 157.62 ± 0.26 |116.43 ± 40.26|36.04 ± 4.60|119.49 ± 2.31|
+|door-human-v1|9.41 ± 4.55|7.00 ± 6.77|-0.11 ± 0.06|17.70 ± 2.55|13.28 ± 2.77|13.52 ± 1.22| 0.27 ± 0.43 |-0.10 ± 0.06|2.51 ± 2.26|7.36 ± 1.24|
+|door-cloned-v1|3.40 ± 0.95|10.37 ± 4.09|-0.20 ± 0.11|10.53 ± 2.82|-0.08 ± 0.13|9.02 ± 1.47| 7.73 ± 6.80 |-0.21 ± 0.10|20.36 ± 1.11|11.18 ± 0.96|
+|door-expert-v1|105.84 ± 0.23|105.92 ± 0.24|4.49 ± 7.39|106.60 ± 0.27|59.47 ± 25.04|107.29 ± 0.37| 106.78 ± 0.04 |0.05 ± 0.02|109.22 ± 0.24|105.49 ± 0.09|
+|hammer-human-v1|12.61 ± 4.87|6.23 ± 4.79|2.38 ± 0.14|16.95 ± 3.61|0.30 ± 0.05|6.86 ± 2.38| 1.18 ± 0.15 |0.25 ± 0.00|3.49 ± 2.17|1.68 ± 0.11|
+|hammer-cloned-v1|8.90 ± 4.04|8.72 ± 3.28|0.96 ± 0.30|10.74 ± 5.54|0.32 ± 0.03|11.63 ± 1.70| 48.16 ± 6.20 |12.67 ± 15.02|0.27 ± 0.01|2.74 ± 0.22|
+|hammer-expert-v1|127.89 ± 0.57|128.15 ± 0.66|33.31 ± 47.65|129.08 ± 0.26|0.93 ± 1.12|129.76 ± 0.37| 134.74 ± 0.30 |91.74 ± 47.77|69.44 ± 47.00|127.39 ± 0.10|
+|relocate-human-v1|0.59 ± 0.27|0.16 ± 0.14|-0.29 ± 0.01|1.77 ± 0.84|1.03 ± 0.20|1.22 ± 0.28| 3.70 ± 2.34 |-0.18 ± 0.14|0.05 ± 0.02|0.08 ± 0.02|
+|relocate-cloned-v1|0.45 ± 0.31|0.74 ± 0.45|-0.02 ± 0.04|0.39 ± 0.13|-0.07 ± 0.02|1.78 ± 0.70| 9.25 ± 2.56 |0.10 ± 0.04|4.11 ± 1.39|0.34 ± 0.09|
+|relocate-expert-v1|110.31 ± 0.36|109.77 ± 0.60|0.23 ± 0.27|111.21 ± 0.32|0.03 ± 0.10|110.12 ± 0.82| 111.14 ± 0.23 |-0.07 ± 0.08|98.32 ± 3.75|106.49 ± 0.30|
+| **adroit average** | 58.92|54.58|20.51|67.69|13.65|62.62| **69.71** |27.49|33.88|52.60|
+
+## Visual summary
+
+![](../assets/perf_profiles_offline.pdf)
\ No newline at end of file
diff --git a/docs/benchmarks/repro.md b/docs/benchmarks/repro.md
new file mode 100644
index 00000000..b90758fa
--- /dev/null
+++ b/docs/benchmarks/repro.md
@@ -0,0 +1,30 @@
+# How to Reproduce
+
+To reproduce all figures and tables from our [technical paper](https://arxiv.org/abs/2210.07105), follow the steps below.
+
+## Collect wandb logs
+
+These scripts collect all wandb logs into .csv files and save them into the `runs_tables` folder.
+We provide these tables, but you can re-collect them yourself.
+```commandline
+python results/get_offline_urls.py
+python results/get_finetune_urls.py
+```
+
+## Collect scores
+
+These scripts collect data from the runs listed in the .csv files and save evaluation scores (and regret in the case of offline-to-online)
+into pickled files, which are stored in the `bin` folder. We provide the pickled data, but if you need to extract more data,
+you can modify the scripts for your purposes.
+```commandline
+python results/get_offline_scores.py
+python results/get_finetune_scores.py
+```
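+
+If you want to work with the extracted scores directly, they can be unpickled from the `bin` folder. A minimal sketch (the filename below is hypothetical, check the scripts for the exact names; scores are grouped per algorithm and per dataset, see `results/get_offline_scores.py`):
+```python
+import pickle
+
+# hypothetical filename; see results/get_offline_scores.py for what is actually written
+with open("bin/offline_scores.pickle", "rb") as f:
+    full_scores = pickle.load(f)
+
+# full_scores[algorithm][dataset] holds a list of per-run evaluation scores
+for algorithm, datasets in full_scores.items():
+    print(algorithm, sorted(datasets))
+```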
+
+## Print tables
+
+These scripts use pickled data, print all the tables, and save all figures into the `out` directory.
+```commandline
+python results/get_offline_tables_and_plots.py
+python results/get_finetune_tables_and_plots.py
+```
\ No newline at end of file
diff --git a/docs/community/contrib.md b/docs/community/contrib.md
new file mode 100644
index 00000000..1f91155b
--- /dev/null
+++ b/docs/community/contrib.md
@@ -0,0 +1,116 @@
+# Contribution
+
+## Contributing to the codebase
+
+We welcome:
+
+- Bug reports
+- Pull requests for bug fixes
+- Logs and documentation improvements
+- New algorithms and datasets
+- Better hyperparameters (but with supporting evidence)
+
+### Setup
+
+Contributing code is done through the standard GitHub workflow:
+
+1. Fork this repo
+2. Make a change and commit your code
+3. Submit a pull request. It will be reviewed by maintainers, and they'll give feedback or make requests as applicable
+
+```commandline
+git clone git@github.com:corl-team/CORL.git
+cd CORL
+pip install -r requirements/requirements_dev.txt
+```
+
+For dependency installation, see the [get started section](../get-started/install.md).
+
+### Code style
+
+The CI will run several checks on the new code pushed to the CORL repository.
+These checks can also be run locally without waiting for the CI by following the steps below:
+
+
+1. [install `pre-commit`](https://pre-commit.com/#install),
+2. install the Git hooks by running `pre-commit install`.
+
+Once those two steps are done, the Git hooks will be run automatically at every new commit.
+The Git hooks can also be run manually with `pre-commit run --all-files`, and
+if needed they can be skipped (not recommended) with `git commit --no-verify`.
+
+We use [Ruff](https://github.com/astral-sh/ruff) as our main linter. If you want to see possible
+problems before running pre-commit, you can run `ruff check --diff .` to see the exact linter suggestions and fixes.
+
+## Adding new algorithms
+
+!!! warning
+    While we welcome any algorithms, it is better to open an issue with the proposal first,
+    so we can discuss the details. Unfortunately, not all algorithms are equally
+    easy to understand and reproduce. We may be able to give you some advice,
+    or, on the contrary, warn you that this particular algorithm requires too many
+    computational resources to fully reproduce the results and that it is better to work on something else.
+
+
+All new algorithms should go into `algorithms/contrib/offline` for purely
+offline algorithms and into `algorithms/contrib/finetune` for offline-to-online algorithms.
+
+We as a team try to keep the core as reliable and reproducible as possible,
+but we may not have the resources to support all future algorithms.
+Therefore, this separation is necessary, as we cannot guarantee that all
+algorithms from `algorithms/contrib` exactly reproduce the results of their original publications.
+
+Make sure your new code is properly documented and all references to the original implementations and papers are present
+(for example as in [Decision Transformer](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py)).
+Follow the conventions for naming config arguments, functions, and classes. Try to stylistically imitate the already existing implementations.
+
+Please, **explain all the tricks and possible differences from the original implementation in as much detail as possible**.
+Keep in mind that this code may be used by other researchers. Make their lives easier!
+
+### Running benchmarks
+
+Although you will have to do a hyperparameter search while reproducing the algorithm,
+in the end we expect to see final configs in `configs/contrib///.yaml` with the best hyperparameters for all
+datasets considered. The configs should be in `yaml` format, containing all hyperparameters sorted
+in alphabetical order (see existing configs for inspiration).
+
+Use these conventions to name your runs in the configs:
+1. `name: `
+2. `group: --multiseed-v0`, increment version if needed
+3. use our [\_\_post_init\_\_](https://github.com/tinkoff-ai/CORL/blob/962688b405f579a1ce6ec1b57e6369aaf76f9e69/algorithms/offline/awac.py#L48) implementation in your config dataclass (a rough sketch is shown below)
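+
+For illustration, such a config dataclass could look like this (not the exact CORL code; the algorithm and run names are purely illustrative):
+```python
+import os
+import uuid
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class TrainConfig:
+    project: str = "CORL"
+    group: str = "new-algo-halfcheetah-medium-v2-multiseed-v0"
+    name: str = "new-algo"
+    env_name: str = "halfcheetah-medium-v2"
+    checkpoints_path: Optional[str] = None
+
+    def __post_init__(self):
+        # make each run name unique by appending the dataset name and a short random suffix
+        self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
+        if self.checkpoints_path is not None:
+            self.checkpoints_path = os.path.join(self.checkpoints_path, self.name)
+```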
+
+Since we are releasing wandb logs for all algorithms, you will need to submit multiseed (~4 seeds)
+training runs to the `CORL` project in the wandb [corl-team](https://wandb.ai/corl-team) organization. We'll invite you there when the time comes.
+
+We usually use wandb sweeps for this. You can use this example config (it will work with pyrallis, as it expects the `config_path` cli argument):
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/contrib/.py
+method: grid
+parameters:
+ config_path:
+ # algo_type is offline or finetune (see sections above)
+ values: [
+ "configs/contrib///.yaml",
+ "configs/contrib///.yaml",
+ "configs/contrib///.yaml",
+ ]
+ train_seed:
+ values: [0, 1, 2, 3]
+```
+Then proceed as usual: create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent `.
+
+Based on the results, you will need to create wandb reports so that other users can easily inspect them.
+You can use any of the already existing reports as an example (see the [README.md](https://github.com/corl-team/CORL/tree/main)).
+
+### Checklist
+
+Ideally, all checks should be completed!
+
+- [ ] Issue about new algorithm is open
+- [ ] Single-file implementation is added to the `algorithms/contrib`
+- [ ] PR has passed all the tests
+- [ ] Evidence that implementation reproduces original results is provided
+- [ ] Configs with the best hyperparameters for all datasets are added to the `configs/contrib`
+- [ ] Logs and reports for best hyperparameters are submitted to our wandb organization
diff --git a/docs/community/publications.md b/docs/community/publications.md
new file mode 100644
index 00000000..cabc4960
--- /dev/null
+++ b/docs/community/publications.md
@@ -0,0 +1,28 @@
+# List of Publications
+
+!!! tip
+ Please open a pull request to add missing entries!
+
+List of publications that are using CORL algorithms or benchmarked results:
+
+- Lu, C., Ball, P. J., & Parker-Holder, J. Synthetic Experience Replay.
+- Beeson, A., & Montana, G. (2023). Balancing policy constraint and ensemble size in uncertainty-based offline reinforcement learning. arXiv preprint arXiv:2303.14716.
+- Nikulin, A., Kurenkov, V., Tarasov, D., & Kolesnikov, S. (2023). Anti-exploration by random network distillation. arXiv preprint arXiv:2301.13616.
+- Bhargava, P., Chitnis, R., Geramifard, A., Sodhani, S., & Zhang, A. (2023). Sequence Modeling is a Robust Contender for Offline Reinforcement Learning. arXiv preprint arXiv:2305.14550.
+- Hu, X., Ma, Y., Xiao, C., Zheng, Y., & Meng, Z. (2023). In-Sample Policy Iteration for Offline Reinforcement Learning. arXiv preprint arXiv:2306.05726.
+- Lian, S., Ma, Y., Liu, J., Zheng, Y., & Meng, Z. (2023). HIPODE: Enhancing Offline Reinforcement Learning with High-Quality Synthetic Data from a Policy-Decoupled Approach. arXiv preprint arXiv:2306.06329.
+- He, H., Bai, C., Xu, K., Yang, Z., Zhang, W., Wang, D., ... & Li, X. (2023). Diffusion Model is an Effective Planner and Data Synthesizer for Multi-Task Reinforcement Learning. arXiv preprint arXiv:2305.18459.
+- Liu, J., Ma, Y., Hao, J., Hu, Y., Zheng, Y., Lv, T., & Fan, C. (2023). Prioritized Trajectory Replay: A Replay Memory for Data-driven Reinforcement Learning. arXiv preprint arXiv:2306.15503.
+- Chitnis, R., Xu, Y., Hashemi, B., Lehnert, L., Dogan, U., Zhu, Z., & Delalleau, O. (2023). IQL-TD-MPC: Implicit Q-Learning for Hierarchical Model Predictive Control. arXiv preprint arXiv:2306.00867.
+- Kurenkov, V., Nikulin, A., Tarasov, D., & Kolesnikov, S. (2023). Katakomba: Tools and Benchmarks for Data-Driven NetHack. arXiv preprint arXiv:2306.08772.
+- Lian, S., Ma, Y., Liu, J., Jianye, H. A. O., Zheng, Y., & Meng, Z. (2023, July). A Policy-Decoupled Method for High-Quality Data Augmentation in Offline Reinforcement Learning. In ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems.
\ No newline at end of file
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
new file mode 100644
index 00000000..3b788d3d
--- /dev/null
+++ b/docs/get-started/install.md
@@ -0,0 +1,38 @@
+# Installation
+
+## Manual
+!!! warning
+    Unfortunately, installing all dependencies can cause some difficulties at the moment, mainly due to **D4RL** and
+    the old version of MuJoCo it is locked to. It will become much easier once the migration to **Minari** is done.
+
+All necessary dependencies are specified in the [`requirements/requirements.txt`](https://github.com/corl-team/CORL/blob/main/requirements/requirements.txt) file.
+You can just clone the repo and install all dependencies with pip:
+```commandline
+git clone https://github.com/corl-team/CORL.git
+cd CORL
+pip install -r requirements/requirements.txt
+```
+
+In addition to the packages specified there, the dependencies required by D4RL, namely the MuJoCo binaries, must also be installed.
+We recommend following the official guide from [**mujoco-py**](https://github.com/openai/mujoco-py). You will need to download
+the MuJoCo 2.1 binaries and extract the downloaded `mujoco210` directory to `~/.mujoco/mujoco210`:
+```commandline
+mkdir -p ~/.mujoco \
+ && wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz \
+ && tar -xf mujoco.tar.gz -C ~/.mujoco \
+ && rm mujoco.tar.gz
+export LD_LIBRARY_PATH=~/.mujoco/mujoco210/bin:${LD_LIBRARY_PATH}
+```
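+
+Once the binaries are in place, a quick sanity check (assuming the installation went well) is to create one of the D4RL environments and load its dataset:
+```python
+import gym
+import d4rl  # noqa: F401, importing d4rl registers the offline environments in gym
+
+env = gym.make("halfcheetah-medium-v2")
+dataset = env.get_dataset()  # the dataset is downloaded on first use
+print(dataset["observations"].shape, dataset["actions"].shape)
+```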
+If you have any problems with the installation, we advise you to first look for similar issues in the
+original [**D4RL**](https://github.com/Farama-Foundation/D4RL) and [**mujoco-py**](https://github.com/openai/mujoco-py) repositories.
+Most likely the problem is in **D4RL**, not in **CORL** :smile:
+
+## Docker
+
+To simplify installation and improve reproducibility, we provide a preconfigured
+[Dockerfile](https://github.com/corl-team/CORL/blob/main/Dockerfile) that you can use:
+```bash
+cd CORL
+docker build -t corl .
+docker run --gpus=all -it --rm --name corl-container corl
+```
\ No newline at end of file
diff --git a/docs/get-started/usage.md b/docs/get-started/usage.md
new file mode 100644
index 00000000..e65b71c0
--- /dev/null
+++ b/docs/get-started/usage.md
@@ -0,0 +1,157 @@
+# Basic Usage
+
+![corl_tldr](../assets/corl.pdf)
+
+## How to Train
+
+We use [pyrallis](https://github.com/eladrich/pyrallis) for configuration, so after the dependencies have been installed,
+there are two ways to run the CORL algorithms:
+
+1. Manually specifying all the arguments within the terminal (they will overwrite the default ones):
+```commandline
+python algorithms/offline/dt.py \
+ --project="CORL-Test" \
+ --group="DT-Test" \
+ --name="dt-testing-run" \
+ --env_name="halfcheetah-medium-v2" \
+ --device="cuda:0"
+ # etc...
+```
+
+2. With a yaml config. First, create a yaml file with all the needed hyperparameters:
+```yaml title="dt_example_config.yaml"
+# taken from https://github.com/corl-team/CORL/blob/main/configs/offline/dt/halfcheetah/medium_v2.yaml
+attention_dropout: 0.1
+batch_size: 4096
+betas:
+- 0.9
+- 0.999
+checkpoints_path: null
+clip_grad: 0.25
+deterministic_torch: false
+device: cuda
+embedding_dim: 128
+embedding_dropout: 0.1
+env_name: "halfcheetah-medium-v2"
+episode_len: 1000
+eval_episodes: 100
+eval_every: 5000
+eval_seed: 42
+group: "dt-halfcheetah-medium-v2-multiseed-v2"
+learning_rate: 0.0008
+max_action: 1.0
+name: "DT"
+num_heads: 1
+num_layers: 3
+num_workers: 4
+project: "CORL"
+residual_dropout: 0.1
+reward_scale: 0.001
+seq_len: 20
+target_returns: [12000.0, 6000.0]
+train_seed: 10
+update_steps: 100000
+warmup_steps: 10000
+weight_decay: 0.0001
+```
+After that, we can supply all hyperparameters from the config with the `config_path` argument:
+```commandline
+python algorithms/offline/dt.py \
+    --config_path="dt_example_config.yaml" \
+    --device="cuda:0"
+    # you can also overwrite any other hyperparameter if needed
+    # etc...
+```
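+
+Under the hood, pyrallis simply builds the training config dataclass from the yaml file and/or the CLI overrides. A minimal sketch of this pattern (not the exact CORL code; the fields are illustrative) looks like this:
+```python
+from dataclasses import dataclass
+
+import pyrallis
+
+
+@dataclass
+class TrainConfig:
+    # a few illustrative fields; real configs have many more (see the help output below)
+    project: str = "CORL"
+    group: str = "DT-D4RL"
+    name: str = "DT"
+    env_name: str = "halfcheetah-medium-v2"
+    device: str = "cuda"
+
+
+@pyrallis.wrap()  # parses --config_path (if given) and any CLI overrides into TrainConfig
+def train(config: TrainConfig):
+    print(config)
+
+
+if __name__ == "__main__":
+    train()
+```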
+By default, the training script will log metrics to the wandb project and group specified by the `project` and `group` arguments.
+If you want to disable logging, run `wandb disabled` or `wandb offline`. To turn it back on, run `wandb online`.
+For more options, see the [wandb documentation](https://docs.wandb.ai/guides/technical-faq/general#can-i-disable-wandb-when-testing-my-code).
+
+If you're not familiar with [Weights & Biases](https://wandb.ai/site) logging tools, it is better to first familiarize
+yourself with the basics [here](https://docs.wandb.ai/quickstart).
+
+For an explanation of all logged metrics, refer to the documentation of the specific algorithm.
+
+## CLI Documentation
+
+To see all available hyperparameters and their brief explanations, simply run `python algorithms/offline/dt.py --help` (this works for all algorithms):
+```commandline
+usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int]
+ [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float]
+ [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float]
+ [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int]
+ [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]]
+ [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool]
+ [--train_seed int] [--eval_seed int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: DT-D4RL)
+ --name str wandb run name (default: DT)
+ --embedding_dim int transformer hidden dim (default: 128)
+ --num_layers int depth of the transformer model (default: 3)
+ --num_heads int number of heads in the attention (default: 1)
+ --seq_len int maximum sequence length during training (default: 20)
+ --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000)
+ --attention_dropout float
+ attention dropout (default: 0.1)
+ --residual_dropout float
+ residual dropout (default: 0.1)
+ --embedding_dropout float
+ embeddings dropout (default: 0.1)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --learning_rate float
+ AdamW optimizer learning rate (default: 0.0001)
+ --betas float float AdamW optimizer betas (default: (0.9, 0.999))
+ --weight_decay float AdamW weight decay (default: 0.0001)
+ --clip_grad [float] maximum gradient norm during training, optional (default: 0.25)
+ --batch_size int training batch size (default: 64)
+ --update_steps int total training steps (default: 100000)
+ --warmup_steps int warmup steps for the learning rate scheduler (increasing from zero to learning_rate) (default:
+ 10000)
+ --reward_scale float reward scaling, to reduce the magnitude (default: 0.001)
+ --num_workers int number of workers for the pytorch dataloader (default: 4)
+ --target_returns float [float, ...]
+ target return-to-go for the prompting during evaluation (default: (12000.0, 6000.0))
+ --eval_episodes int number of episodes to run during evaluation (default: 100)
+ --eval_every int evaluation frequency, will evaluate eval_every training steps (default: 10000)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones where available
+ (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --device str training device (default: cuda)
+```
+
+## Benchmarking
+
+Sooner or later you will probably want to run many experiments at once, for example to search for hyperparameters,
+or to do multi-seed training for some datasets. For something like this we recommend using wandb sweeps (and we use them ourselves).
+The general recipe looks like this. First, create a wandb sweep config:
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/offline/dt.py
+method: grid
+parameters:
+ # specify all configs to run for the chosen algorithm
+ config_path:
+ values: [
+ "configs/offline/dt/halfcheetah/medium_v2.yaml",
+ "configs/offline/dt/halfcheetah/medium_replay_v2.yaml",
+ "configs/offline/dt/halfcheetah/medium_expert_v2.yaml",
+ ]
+ train_seed:
+ values: [0, 1, 2, 3]
+```
+Then proceed as usual: create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent `.
+This will train multiple seeds for each config.
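+
+If you prefer not to use wandb sweeps, a minimal local alternative (sequential, assuming the same configs and seeds as in the sweep above) could look like this sketch:
+```python
+import itertools
+import subprocess
+
+configs = [
+    "configs/offline/dt/halfcheetah/medium_v2.yaml",
+    "configs/offline/dt/halfcheetah/medium_replay_v2.yaml",
+    "configs/offline/dt/halfcheetah/medium_expert_v2.yaml",
+]
+seeds = [0, 1, 2, 3]
+
+# run every (config, seed) pair one after another
+for config_path, seed in itertools.product(configs, seeds):
+    subprocess.run(
+        [
+            "python", "algorithms/offline/dt.py",
+            f"--config_path={config_path}",
+            f"--train_seed={seed}",
+        ],
+        check=True,
+    )
+```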
+
+All configs with full hyperparameters for all datasets and algorithms are in [`configs`](https://github.com/corl-team/CORL/tree/main/configs).
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..cb337970
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,77 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# CORL (Clean Offline Reinforcement Learning)
+
+[![Twitter](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://twitter.com/vladkurenkov/status/1669361090550177793)
+[![arXiv](https://img.shields.io/badge/arXiv-2210.07105-b31b1b.svg)](https://arxiv.org/abs/2210.07105)
+[](https://github.com/tinkoff-ai/CORL/blob/main/LICENSE)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+
+🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations
+of SOTA **offline reinforcement learning** algorithms. Each implementation is backed by a research-friendly codebase, allowing
+you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL,
+check them out too! The highlight features of CORL are:
+
+* 📜 Single-file implementation
+* 📈 Benchmarked Implementation (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs :material-arm-flex:)
+* 🖼 [Weights and Biases](https://wandb.ai/site) integration
+
+You can read more about CORL design and main results in our [technical paper](https://arxiv.org/abs/2210.07105).
+
+
+!!! tip
+ ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment.
+
+
+!!! info
+    **Minari** and **Gymnasium** support: [Farama-Foundation/Minari](https://github.com/Farama-Foundation/Minari) is the
+    next generation of D4RL that will continue to be maintained and introduce new features and datasets.
+    Please see their [announcement](https://farama.org/Announcing-Minari) for further details.
+    We are currently slowly migrating to Minari, and the progress
+    can be tracked [here](https://github.com/corl-team/CORL/issues/2). This will allow us to significantly update dependencies
+    and simplify installation, and give users access to many new datasets out of the box!
+
+
+!!! warning
+    CORL (similarly to CleanRL) is not a modular library and therefore it is not meant to be imported.
+    At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy
+    to understand. You should consider using CORL if you want to 1) understand and control all implementation details
+    of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support.
+
+
+## Algorithms Implemented
+
+| Algorithm | Variants Implemented | Wandb Report |
+|--------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- |
+| **Offline and Offline-to-Online** | |
+| ✅ [Conservative Q-Learning for Offline Reinforcement Learning <br>(CQL)](https://arxiv.org/abs/2006.04779) | :material-github: [`offline/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/cql.py) <br> :material-github: [`finetune/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cql.py) <br> :material-file-document: [docs](algorithms/cql.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-CQL--VmlldzoyNzA2MTk5) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-CQL--Vmlldzo0NTQ3NTMz)
+| ✅ [Accelerating Online Reinforcement Learning with Offline Datasets <br>(AWAC)](https://arxiv.org/abs/2006.09359) | :material-github: [`offline/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/awac.py) <br> :material-github: [`finetune/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/awac.py) <br> :material-file-document: [docs](algorithms/awac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-AWAC--VmlldzoyNzA2MjE3) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-AWAC--VmlldzozODAyNzQz)
+| ✅ [Offline Reinforcement Learning with Implicit Q-Learning <br>(IQL)](https://arxiv.org/abs/2110.06169) | :material-github: [`offline/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/iql.py) <br> :material-github: [`finetune/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/iql.py) <br> :material-file-document: [docs](algorithms/iql.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-IQL--VmlldzoyNzA2MTkx) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-IQL--VmlldzozNzE1MTEy)
+| **Offline-to-Online only** | |
+| ✅ [Supported Policy Optimization for Offline Reinforcement Learning <br>(SPOT)](https://arxiv.org/abs/2202.06239) | :material-github: [`finetune/spot.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/spot.py) <br> :material-file-document: [docs](algorithms/spot.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-SPOT--VmlldzozODk5MTgx)
+| ✅ [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning <br>(Cal-QL)](https://arxiv.org/abs/2303.05479) | :material-github: [`finetune/cal_ql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cal_ql.py) <br> :material-file-document: [docs](algorithms/cal-ql.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-Cal-QL--Vmlldzo0NTQ3NDk5)
+| **Offline only** | |
+| ✅ Behavioral Cloning <br>(BC) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py) <br> :material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC--VmlldzoyNzA2MjE1)
+| ✅ Behavioral Cloning-10% <br>(BC-10%) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py) <br> :material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC-10---VmlldzoyNzEwMjcx)
+| ✅ [A Minimalist Approach to Offline Reinforcement Learning <br>(TD3+BC)](https://arxiv.org/abs/2106.06860) | :material-github: [`offline/td3_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/td3_bc.py) <br> :material-file-document: [docs](algorithms/td3-bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-TD3-BC--VmlldzoyNzA2MjA0)
+| ✅ [Decision Transformer: Reinforcement Learning via Sequence Modeling <br>(DT)](https://arxiv.org/abs/2106.01345) | :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py) <br> :material-file-document: [docs](algorithms/dt.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-Decision-Transformer--VmlldzoyNzA2MTk3)
+| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble <br>(SAC-N)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py) <br> :material-file-document: [docs](algorithms/sac-n.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-SAC-N--VmlldzoyNzA1NTY1)
+| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble <br>(EDAC)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/edac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/edac.py) <br> :material-file-document: [docs](algorithms/edac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-EDAC--VmlldzoyNzA5ODUw)
+| ✅ [Revisiting the Minimalist Approach to Offline Reinforcement Learning <br>(ReBRAC)](https://arxiv.org/abs/2305.09836) | :material-github: [`offline/rebrac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/rebrac.py) <br> :material-file-document: [docs](algorithms/rebrac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-ReBRAC--Vmlldzo0ODkzOTQ2)
+| ✅ [Q-Ensemble for Offline RL: Don't Scale the Ensemble, Scale the Batch Size <br>(LB-SAC)](https://arxiv.org/abs/2211.11092) | :material-github: [`offline/lb_sac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/lb_sac.py) <br> :material-file-document: [docs](algorithms/lb-sac.md) | :material-chart-box: [`Offline Gym-MuJoCo`](https://wandb.ai/tlab/CORL/reports/LB-SAC-D4RL-Results--VmlldzozNjIxMDY1)
+
+## Citing CORL
+If you use CORL in your work, please use the following bibtex
+```bibtex
+@inproceedings{
+tarasov2022corl,
+ title={CORL: Research-oriented Deep Offline Reinforcement Learning Library},
+ author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov},
+ booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''},
+ year={2022},
+ url={https://openreview.net/forum?id=SyAS49bBcv}
+}
+```
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..bda5b719
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,85 @@
+site_name: Clean Offline RL
+theme:
+ name: material
+ logo: assets/logo.jpeg
+ palette:
+ # Palette toggle for light mode
+ - scheme: default
+ toggle:
+ icon: material/toggle-switch
+ name: Switch to dark mode
+
+ # Palette toggle for dark mode
+ - scheme: slate
+ toggle:
+ icon: material/toggle-switch-off-outline
+ name: Switch to light mode
+ features:
+ - navigation.instant
+ - navigation.tracking
+ - navigation.sections
+ - navigation.expand
+ - navigation.path
+# - toc.integrate
+ - navigation.top
+ - search.suggest
+ - search.highlight
+ - header.autohide
+ - content.code.copy
+ - content.code.annotate
+
+copyright: Copyright © 2022, CORL Team
+
+repo_url: https://github.com/corl-team/CORL
+repo_name: corl-team/CORL
+
+markdown_extensions:
+ - meta
+ - admonition
+ - pymdownx.details
+ - pymdownx.superfences
+ - pymdownx.emoji:
+ emoji_index: !!python/name:materialx.emoji.twemoji
+ emoji_generator: !!python/name:materialx.emoji.to_svg
+ - pymdownx.highlight:
+ anchor_linenums: true
+ line_spans: __span
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.tasklist:
+ custom_checkbox: true
+ clickable_checkbox: false
+ - pymdownx.arithmatex:
+ generic: true
+
+extra_javascript:
+ - javascripts/mathjax.js
+ - https://polyfill.io/v3/polyfill.min.js?features=es6
+ - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
+
+nav:
+ - Overview: index.md
+ - Get Started:
+ - get-started/install.md
+ - get-started/usage.md
+ - Benchmarks:
+ - benchmarks/offline.md
+ - benchmarks/offline-to-online.md
+ - benchmarks/repro.md
+ - Algorithms:
+ - algorithms/bc.md
+ - algorithms/td3-bc.md
+ - algorithms/dt.md
+ - algorithms/sac-n.md
+ - algorithms/edac.md
+ - algorithms/rebrac.md
+ - algorithms/lb-sac.md
+ - algorithms/cql.md
+ - algorithms/awac.md
+ - algorithms/iql.md
+ - algorithms/cal-ql.md
+ - algorithms/spot.md
+ - Community:
+ - community/contrib.md
+ - community/publications.md
\ No newline at end of file
diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt
index e0e489b0..16d0cc05 100644
--- a/requirements/requirements_dev.txt
+++ b/requirements/requirements_dev.txt
@@ -10,6 +10,7 @@ torch==1.11.0+cu113
pyrallis==0.3.1
pre-commit==3.3.3
ruff==0.0.278
+mkdocs-material==9.1.21
--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
jax==0.4.1
jaxlib[cuda11_cudnn82]==0.4.1
diff --git a/results/get_offline_scores.py b/results/get_offline_scores.py
index 0e165a97..fe5e8570 100644
--- a/results/get_offline_scores.py
+++ b/results/get_offline_scores.py
@@ -44,7 +44,11 @@ def process_runs(df):
df.iterrows(), desc="Runs scores downloading", position=0, leave=True
):
full_scores[row["algorithm"]][row["dataset"]].append(
- get_run_scores(row["url"], row["algorithm"] == "DT", row["algorithm"] == "AWAC")
+ get_run_scores(
+ row["url"],
+ row["algorithm"] == "DT",
+ row["algorithm"] == "AWAC"
+ )
)
return full_scores