diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..0481485d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,25 @@
+name: ci
+on:
+ push:
+ branches:
+ - main
+ - howuhh/docs-wip
+permissions:
+ contents: write
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v4
+ with:
+ python-version: 3.x
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+ - uses: actions/cache@v3
+ with:
+ key: mkdocs-material-${{ env.cache_id }}
+ path: .cache
+ restore-keys: |
+ mkdocs-material-
+ - run: pip install mkdocs-material
+ - run: mkdocs gh-deploy --force
diff --git a/.gitignore b/.gitignore
index ce35dd32..03469487 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,4 +145,4 @@ dmypy.json
.json
.yaml
wandb
-assets/
\ No newline at end of file
+#assets/
\ No newline at end of file
diff --git a/README.md b/README.md
index 2792adfc..36f74e2b 100644
--- a/README.md
+++ b/README.md
@@ -10,16 +10,25 @@
🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations of SOTA ORL algorithms. Each implementation is backed by a research-friendly codebase, allowing you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL, check them out too!
* 📜 Single-file implementation
-* 📈 Benchmarked Implementation for N algorithms
+* 📈 Benchmarked implementations (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs)
* 🖼 [Weights and Biases](https://wandb.ai/site) integration
+You can read more about CORL's design and main results in our [technical paper](https://arxiv.org/abs/2210.07105).
+
----
* ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment.
----
+> ⚠️ **NOTE**: CORL (similarly to CleanRL) is not a modular library and therefore is not meant to be imported.
+At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy
+to understand. You should consider using CORL if you want to 1) understand and control all implementation details
+of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support.
+
## Getting started
+Please refer to the [documentation](https://corl-team.github.io/CORL/get-started/install/) for more details. TLDR:
+
```bash
git clone https://github.com/corl-team/CORL.git && cd CORL
pip install -r requirements/requirements_dev.txt
@@ -213,7 +222,7 @@ If you use CORL in your work, please use the following bibtex
```bibtex
@inproceedings{
tarasov2022corl,
- title={{CORL}: Research-oriented Deep Offline Reinforcement Learning Library},
+ title={CORL: Research-oriented Deep Offline Reinforcement Learning Library},
author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov},
booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''},
year={2022},
diff --git a/algorithms/offline/any_percent_bc.py b/algorithms/offline/any_percent_bc.py
index edacc43e..7b6dfa83 100644
--- a/algorithms/offline/any_percent_bc.py
+++ b/algorithms/offline/any_percent_bc.py
@@ -19,26 +19,40 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount factor
- # BC
- buffer_size: int = 2_000_000 # Replay buffer size
- frac: float = 0.1 # Best data fraction to use
- max_traj_len: int = 1000 # Max trajectory length
- normalize: bool = True # Normalize states
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "BC-D4RL"
+ # wandb run name
name: str = "BC"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # training batch size
+ batch_size: int = 256
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # what top fraction of the dataset (sorted by return) to use
+ frac: float = 0.1
+ # maximum possible trajectory length
+ max_traj_len: int = 1000
+ # whether to normalize states
+ normalize: bool = True
+ # discount factor
+ discount: float = 0.99
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/awac.py b/algorithms/offline/awac.py
index 2c652de9..2fb3bf10 100644
--- a/algorithms/offline/awac.py
+++ b/algorithms/offline/awac.py
@@ -20,29 +20,49 @@
@dataclass
class TrainConfig:
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "AWAC-D4RL"
+ # wandb run name
name: str = "AWAC"
- checkpoints_path: Optional[str] = None
-
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-expert-v2"
- seed: int = 42
- test_seed: int = 69
- deterministic_torch: bool = False
- device: str = "cuda"
-
- buffer_size: int = 2_000_000
- num_train_ops: int = 1_000_000
- batch_size: int = 256
- eval_frequency: int = 1000
- n_test_episodes: int = 10
- normalize_reward: bool = False
-
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # actor and critic learning rate
learning_rate: float = 3e-4
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # awac actor loss temperature, controlling balance
+ # between behaviour cloning and Q-value maximization
awac_lambda: float = 1.0
+ # total number of gradient updates during training
+ num_train_ops: int = 1_000_000
+ # training batch size
+ batch_size: int = 256
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # whether to normalize reward (like in IQL)
+ normalize_reward: bool = False
+ # evaluation frequency, will evaluate every eval_frequency
+ # training steps
+ eval_frequency: int = 1000
+ # number of episodes to run during evaluation
+ n_test_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
+ deterministic_torch: bool = False
+ # training random seed
+ seed: int = 42
+ # evaluation random seed
+ test_seed: int = 69
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/cql.py b/algorithms/offline/cql.py
index a1470eb0..307ffc08 100644
--- a/algorithms/offline/cql.py
+++ b/algorithms/offline/cql.py
@@ -23,7 +23,6 @@
@dataclass
class TrainConfig:
- # Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -32,8 +31,6 @@ class TrainConfig:
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
-
- # CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
@@ -59,9 +56,7 @@ class TrainConfig:
q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization
-
- # AntMaze hacks
- bc_steps: int = int(0) # Number of BC steps at start
+ bc_steps: int = int(0) # Number of BC steps at start (AntMaze hacks)
reward_scale: float = 5.0
reward_bias: float = -1.0
policy_log_std_multiplier: float = 1.0
diff --git a/algorithms/offline/dt.py b/algorithms/offline/dt.py
index 37c61e67..367c337e 100644
--- a/algorithms/offline/dt.py
+++ b/algorithms/offline/dt.py
@@ -1,5 +1,5 @@
# inspiration:
-# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa
+# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py
# 2. https://github.com/karpathy/minGPT
import os
import random
@@ -17,44 +17,70 @@
import wandb
from torch.nn import functional as F
from torch.utils.data import DataLoader, IterableDataset
-from tqdm.auto import tqdm, trange # noqa
+from tqdm.auto import trange
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "DT-D4RL"
+ # wandb run name
name: str = "DT"
- # model params
+ # transformer hidden dim
embedding_dim: int = 128
+ # depth of the transformer model
num_layers: int = 3
+ # number of heads in the attention
num_heads: int = 1
+ # maximum sequence length during training
seq_len: int = 20
+ # maximum rollout length, needed for the positional embeddings
episode_len: int = 1000
+ # attention dropout
attention_dropout: float = 0.1
+ # residual dropout
residual_dropout: float = 0.1
+ # embeddings dropout
embedding_dropout: float = 0.1
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # AdamW optimizer learning rate
learning_rate: float = 1e-4
+ # AdamW optimizer betas
betas: Tuple[float, float] = (0.9, 0.999)
+ # AdamW weight decay
weight_decay: float = 1e-4
+ # maximum gradient norm during training, optional
clip_grad: Optional[float] = 0.25
+ # training batch size
batch_size: int = 64
+ # total training steps
update_steps: int = 100_000
+ # warmup steps for the learning rate scheduler
warmup_steps: int = 10_000
+ # reward scaling, to reduce the magnitude
reward_scale: float = 0.001
+ # number of workers for the pytorch dataloader
num_workers: int = 4
- # evaluation params
+ # target return-to-go for prompting during evaluation
target_returns: Tuple[float, ...] = (12000.0, 6000.0)
+ # number of episodes to run during evaluation
eval_episodes: int = 100
+ # evaluation frequency, will evaluate every eval_every training steps
eval_every: int = 10_000
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # training device
device: str = "cuda"
def __post_init__(self):
@@ -180,7 +206,7 @@ def __prepare_sample(self, traj_idx, start_idx):
states = (states - self.state_mean) / self.state_std
returns = returns * self.reward_scale
- # pad up to seq_len if needed
+ # pad up to seq_len if needed, padding is masked during training
mask = np.hstack(
[np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])]
)
diff --git a/algorithms/offline/edac.py b/algorithms/offline/edac.py
index 413801c9..b668e43f 100644
--- a/algorithms/offline/edac.py
+++ b/algorithms/offline/edac.py
@@ -21,36 +21,58 @@
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "EDAC-D4RL"
+ # wandb run name
name: str = "EDAC"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # coefficient for the ensemble diversification loss
eta: float = 1.0
+ # actor learning rate
actor_learning_rate: float = 3e-4
+ # critic learning rate
critic_learning_rate: float = 3e-4
+ # alpha learning rate
alpha_learning_rate: float = 3e-4
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 256
+ # total number of training epochs
num_epochs: int = 3000
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
+ # whether to normalize reward (like in IQL)
normalize_reward: bool = False
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
+ # training device
device: str = "cpu"
def __post_init__(self):
diff --git a/algorithms/offline/iql.py b/algorithms/offline/iql.py
index 72c28726..14cec70a 100644
--- a/algorithms/offline/iql.py
+++ b/algorithms/offline/iql.py
@@ -29,33 +29,55 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- # IQL
- buffer_size: int = 2_000_000 # Replay buffer size
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount factor
- tau: float = 0.005 # Target network update rate
- beta: float = 3.0 # Inverse temperature. Small beta -> BC, big beta -> maximizing Q
- iql_tau: float = 0.7 # Coefficient for asymmetric loss
- iql_deterministic: bool = False # Use deterministic actor
- normalize: bool = True # Normalize states
- normalize_reward: bool = False # Normalize reward
- vf_lr: float = 3e-4 # V function learning rate
- qf_lr: float = 3e-4 # Critic learning rate
- actor_lr: float = 3e-4 # Actor learning rate
- actor_dropout: Optional[float] = None # Adroit uses dropout for policy network
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "IQL-D4RL"
+ # wandb run name
name: str = "IQL"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # discount factor
+ discount: float = 0.99
+ # coefficient for the target critic Polyak's update
+ tau: float = 0.005
+ # actor update inverse temperature, similar to AWAC
+ # small beta -> BC, big beta -> maximizing Q-value
+ beta: float = 3.0
+ # coefficient for asymmetric critic loss
+ iql_tau: float = 0.7
+ # whether to use deterministic actor
+ iql_deterministic: bool = False
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # training batch size
+ batch_size: int = 256
+ # whether to normalize states
+ normalize: bool = True
+ # whether to normalize reward
+ normalize_reward: bool = False
+ # V-critic function learning rate
+ vf_lr: float = 3e-4
+ # Q-critic learning rate
+ qf_lr: float = 3e-4
+ # actor learning rate
+ actor_lr: float = 3e-4
+ # dropout rate for the policy network, used in the Adroit domain, optional
+ actor_dropout: Optional[float] = None
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/algorithms/offline/lb_sac.py b/algorithms/offline/lb_sac.py
index 71fb8c54..4f10f77e 100644
--- a/algorithms/offline/lb_sac.py
+++ b/algorithms/offline/lb_sac.py
@@ -23,36 +23,58 @@
# base learning rate: 3e-4
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "LB-SAC"
+ # wandb run name
name: str = "LB-SAC"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # actor learning rate (before scaling was 3e-4)
actor_learning_rate: float = 0.0018
+ # critic learning rate (before scaling was 3e-4)
critic_learning_rate: float = 0.0018
+ # alpha learning rate (before scaling was 3e-4)
alpha_learning_rate: float = 0.0018
+ # whether to use layer normalization for critic
critic_layernorm: bool = False
+ # whether to use initialization from EDAC paper
edac_init: bool = False
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 10_000
+ # total number of training epochs
num_epochs: int = 300
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
+ # training device
device: str = "cpu"
def __post_init__(self):
diff --git a/algorithms/offline/sac_n.py b/algorithms/offline/sac_n.py
index 0b91ddec..a44da091 100644
--- a/algorithms/offline/sac_n.py
+++ b/algorithms/offline/sac_n.py
@@ -21,36 +21,57 @@
@dataclass
class TrainConfig:
- # wandb params
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "SAC-N"
+ # wandb run name
name: str = "SAC-N"
- # model params
+ # actor and critic hidden dim
hidden_dim: int = 256
+ # critic ensemble size
num_critics: int = 10
+ # discount factor
gamma: float = 0.99
+ # coefficient for the target critic Polyak's update
tau: float = 5e-3
+ # actor learning rate
actor_learning_rate: float = 3e-4
+ # critic learning rate
critic_learning_rate: float = 3e-4
+ # entropy coefficient learning rate for automatic tuning
alpha_learning_rate: float = 3e-4
+ # maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
- # training params
+ # maximum size of the replay buffer
buffer_size: int = 1_000_000
+ # training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
+ # training batch size
batch_size: int = 256
+ # total number of training epochs
num_epochs: int = 3000
+ # number of gradient updates during one epoch
num_updates_on_epoch: int = 1000
+ # whether to normalize reward (like in IQL)
normalize_reward: bool = False
- # evaluation params
+ # number of episodes to run during evaluation
eval_episodes: int = 10
+ # evaluation frequency, will evaluate every eval_every epochs
eval_every: int = 5
- # general params
+ # path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
+ # configure PyTorch to use deterministic algorithms instead
+ # of nondeterministic ones
deterministic_torch: bool = False
+ # training random seed
train_seed: int = 10
+ # evaluation random seed
eval_seed: int = 42
+ # frequency of metrics logging to wandb
log_every: int = 100
- device: str = "cpu"
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
@@ -465,6 +486,8 @@ def eval_actor(
return np.array(episode_rewards)
+# normalization like in the IQL paper
+# https://github.com/ikostrikov/implicit_q_learning/blob/09d700248117881a75cb21f0adb95c6c8a694cb2/train_offline.py#L35 # noqa
def return_reward_range(dataset, max_episode_steps):
returns, lengths = [], []
ep_ret, ep_len = 0.0, 0
diff --git a/algorithms/offline/td3_bc.py b/algorithms/offline/td3_bc.py
index a78bda30..6ae0379c 100644
--- a/algorithms/offline/td3_bc.py
+++ b/algorithms/offline/td3_bc.py
@@ -22,32 +22,51 @@
@dataclass
class TrainConfig:
- # Experiment
- device: str = "cuda"
- env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
- seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
- eval_freq: int = int(5e3) # How often (time steps) we evaluate
- n_episodes: int = 10 # How many episodes run during evaluation
- max_timesteps: int = int(1e6) # Max time steps to run environment
- checkpoints_path: Optional[str] = None # Save path
- load_model: str = "" # Model load file name, "" doesn't load
- # TD3
- buffer_size: int = 2_000_000 # Replay buffer size
- batch_size: int = 256 # Batch size for all networks
- discount: float = 0.99 # Discount ffor
- expl_noise: float = 0.1 # Std of Gaussian exploration noise
- tau: float = 0.005 # Target network update rate
- policy_noise: float = 0.2 # Noise added to target actor during critic update
- noise_clip: float = 0.5 # Range to clip target actor noise
- policy_freq: int = 2 # Frequency of delayed actor updates
- # TD3 + BC
- alpha: float = 2.5 # Coefficient for Q function in actor loss
- normalize: bool = True # Normalize states
- normalize_reward: bool = False # Normalize reward
- # Wandb logging
+ # wandb project name
project: str = "CORL"
+ # wandb group name
group: str = "TD3_BC-D4RL"
+ # wandb run name
name: str = "TD3_BC"
+ # training dataset and evaluation environment
+ env: str = "halfcheetah-medium-expert-v2"
+ # coefficient for the Q-function in actor loss
+ alpha: float = 2.5
+ # discount factor
+ discount: float = 0.99
+ # standard deviation for the gaussian exploration noise
+ expl_noise: float = 0.1
+ # coefficient for the target critic Polyak's update
+ tau: float = 0.005
+ # scaling coefficient for the noise added to
+ # target actor during critic update
+ policy_noise: float = 0.2
+ # range for the target actor noise clipping
+ noise_clip: float = 0.5
+ # actor update delay
+ policy_freq: int = 2
+ # total gradient updates during training
+ max_timesteps: int = int(1e6)
+ # maximum size of the replay buffer
+ buffer_size: int = 2_000_000
+ # training batch size
+ batch_size: int = 256
+ # whether to normalize states
+ normalize: bool = True
+ # whether to normalize reward (like in IQL)
+ normalize_reward: bool = False
+ # evaluation frequency, will evaluate every eval_freq training steps
+ eval_freq: int = int(5e3)
+ # number of episodes to run during evaluation
+ n_episodes: int = 10
+ # path for checkpoints saving, optional
+ checkpoints_path: Optional[str] = None
+ # file name for loading a model, optional
+ load_model: str = ""
+ # training random seed
+ seed: int = 0
+ # training device
+ device: str = "cuda"
def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
diff --git a/docs/algorithms/awac.md b/docs/algorithms/awac.md
new file mode 100644
index 00000000..94731ee0
--- /dev/null
+++ b/docs/algorithms/awac.md
@@ -0,0 +1 @@
+# AWAC
diff --git a/docs/algorithms/bc.md b/docs/algorithms/bc.md
new file mode 100644
index 00000000..7f00d30c
--- /dev/null
+++ b/docs/algorithms/bc.md
@@ -0,0 +1 @@
+# BC
\ No newline at end of file
diff --git a/docs/algorithms/cal-ql.md b/docs/algorithms/cal-ql.md
new file mode 100644
index 00000000..798de1ef
--- /dev/null
+++ b/docs/algorithms/cal-ql.md
@@ -0,0 +1 @@
+# Cal-QL
diff --git a/docs/algorithms/cql.md b/docs/algorithms/cql.md
new file mode 100644
index 00000000..194cc7a1
--- /dev/null
+++ b/docs/algorithms/cql.md
@@ -0,0 +1 @@
+# CQL
\ No newline at end of file
diff --git a/docs/algorithms/dt.md b/docs/algorithms/dt.md
new file mode 100644
index 00000000..e3218c9d
--- /dev/null
+++ b/docs/algorithms/dt.md
@@ -0,0 +1,130 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# DT
+
+## Overview
+
+The Decision Transformer (DT) model casts offline reinforcement learning as a conditional sequence modeling problem.
+
+Unlike prior approaches to offline RL that fit value functions or compute policy gradients, Decision Transformer simply outputs the optimal
+actions by leveraging a causally masked Transformer. By conditioning an autoregressive model on the desired return
+(reward-to-go), past states, and actions, the Decision Transformer can generate future actions that achieve the desired return.
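+
+As a rough illustration of the data layout (not the exact CORL code; all names below are ours), each training context interleaves return-to-go, state and action tokens, and the model is trained to predict the next action under a causal mask:
+
+```python
+import numpy as np
+
+def make_dt_context(states, actions, rewards, seq_len=20):
+    # return-to-go at step t is the sum of rewards from t to the end of the trajectory
+    rtg = np.cumsum(rewards[::-1])[::-1]
+    # one training context: the last seq_len steps of the trajectory
+    r = rtg[-seq_len:, None]    # (seq_len, 1)
+    s = states[-seq_len:]       # (seq_len, state_dim)
+    a = actions[-seq_len:]      # (seq_len, action_dim)
+    # the transformer consumes tokens ordered as (R_1, s_1, a_1, R_2, s_2, a_2, ...)
+    # and predicts a_t from (R_{<=t}, s_{<=t}, a_{<t})
+    return r, s, a
+```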
+
+Original paper:
+
+ * [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345)
+ * [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039)
+ (similar approach, came out at the same time)
+
+Reference resources:
+
+* :material-github: [Official codebase for Decision Transformer](https://github.com/kzl/decision-transformer)
+
+!!! success
+ Thanks to its simple supervised objective and standard transformer architecture, Decision Transformer is stable and easy to implement, as it
+ has a minimal number of moving parts.
+
+!!! warning
+ Despite its simplicity and stability, DT has a number of drawbacks. It is not capable of stitching suboptimal
+ trajectories (which explains its poor performance on AntMaze datasets), and can also [show](https://arxiv.org/abs/2205.15967) poor performance in stochastic environments.
+
+Possible extensions:
+
+* [Online Decision Transformer](https://arxiv.org/abs/2202.05607)
+* [Emergent Agentic Transformer from Chain of Hindsight Experience](https://arxiv.org/abs/2305.16554)
+* [Q-learning Decision Transformer: Leveraging Dynamic Programming for Conditional Sequence Modelling in Offline RL](https://proceedings.mlr.press/v202/yamagata23a.html)
+
+We'd be glad if someone would be interested in contributing them!
+
+## Implemented Variants
+
+| Variants Implemented | Description |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
+| :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py) <br> :material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/dt) | For continuous action spaces and offline RL without fine-tuning support. |
+
+
+## Explanation of logged metrics
+
+* `eval/{target_return}_return_mean`: mean undiscounted evaluation return when prompted with `config.target_return` value (there might be more than one)
+* `eval/{target_return}_return_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes
+* `eval/{target_return}_normalized_score_mean`: mean normalized score when prompted with `config.target_return` value (there might be more than one).
+ Should be between 0 and 100, where 100+ is the performance above expert for this environment.
+ Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)].
+* `eval/{target_return}_normalized_score_std`: standard deviation of the normalized score return across `config.eval_episodes` episodes
+* `train_loss`: current training loss, mean squared error (MSE) for continuous action spaces
+* `learning_rate`: current learning rate, helps monitor learning rate schedule
+
+## Implementation details
+
+1. Batch sampling weighted by trajectory length, see the sketch after this list (:material-github: [algorithms/offline/dt.py#L171](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L171))
+2. State normalization during training and inference (:material-github: [algorithms/offline/dt.py#L181](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L181))
+3. Reward downscaling (:material-github: [algorithms/offline/dt.py#L182](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L182))
+4. Positional embedding shared across one transition (:material-github: [algorithms/offline/dt.py#L323](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L323))
+5. Prompting with multiple return-to-go's during evaluation, as DT can be sensitive to the prompt (:material-github: [algorithms/offline/dt.py#L498](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/dt.py#L498))
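+
+A minimal sketch of the length-weighted sampling from item 1, assuming `traj_lens` holds the lengths of all trajectories in the dataset (names are ours, not the exact CORL code):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+traj_lens = np.array([1000, 180, 460])       # example trajectory lengths
+sample_prob = traj_lens / traj_lens.sum()    # longer trajectories are sampled proportionally more often
+
+# pick a trajectory and a start offset for one training sequence
+traj_idx = rng.choice(len(traj_lens), p=sample_prob)
+start_idx = rng.integers(0, traj_lens[traj_idx])
+```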
+
+## Experimental results
+
+For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md).
+Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly.
+
+
+
+## Training options
+
+```commandline
+usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int]
+ [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float]
+ [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float]
+ [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int]
+ [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]]
+ [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool]
+ [--train_seed int] [--eval_seed int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: DT-D4RL)
+ --name str wandb run name (default: DT)
+ --embedding_dim int transformer hidden dim (default: 128)
+ --num_layers int depth of the transformer model (default: 3)
+ --num_heads int number of heads in the attention (default: 1)
+ --seq_len int maximum sequence length during training (default: 20)
+ --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000)
+ --attention_dropout float
+ attention dropout (default: 0.1)
+ --residual_dropout float
+ residual dropout (default: 0.1)
+ --embedding_dropout float
+ embeddings dropout (default: 0.1)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --learning_rate float
+ AdamW optimizer learning rate (default: 0.0001)
+ --betas float float AdamW optimizer betas (default: (0.9, 0.999))
+ --weight_decay float AdamW weight decay (default: 0.0001)
+ --clip_grad [float] maximum gradient norm during training, optional (default: 0.25)
+ --batch_size int training batch size (default: 64)
+ --update_steps int total training steps (default: 100000)
+ --warmup_steps int warmup steps for the learning rate scheduler (default: 10000)
+ --reward_scale float reward scaling, to reduce the magnitude (default: 0.001)
+ --num_workers int number of workers for the pytorch dataloader (default: 4)
+ --target_returns float [float, ...]
+ target return-to-go for prompting during evaluation (default: (12000.0, 6000.0))
+ --eval_episodes int number of episodes to run during evaluation (default: 100)
+ --eval_every int evaluation frequency, will evaluate every eval_every training steps (default: 10000)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --device str training device (default: cuda)
+```
+
diff --git a/docs/algorithms/edac.md b/docs/algorithms/edac.md
new file mode 100644
index 00000000..68c4a050
--- /dev/null
+++ b/docs/algorithms/edac.md
@@ -0,0 +1 @@
+# EDAC
diff --git a/docs/algorithms/iql.md b/docs/algorithms/iql.md
new file mode 100644
index 00000000..d619a8c4
--- /dev/null
+++ b/docs/algorithms/iql.md
@@ -0,0 +1 @@
+# IQL
\ No newline at end of file
diff --git a/docs/algorithms/lb-sac.md b/docs/algorithms/lb-sac.md
new file mode 100644
index 00000000..4498896c
--- /dev/null
+++ b/docs/algorithms/lb-sac.md
@@ -0,0 +1 @@
+# LB-SAC
diff --git a/docs/algorithms/rebrac.md b/docs/algorithms/rebrac.md
new file mode 100644
index 00000000..6978645c
--- /dev/null
+++ b/docs/algorithms/rebrac.md
@@ -0,0 +1 @@
+# ReBRAC
\ No newline at end of file
diff --git a/docs/algorithms/sac-n.md b/docs/algorithms/sac-n.md
new file mode 100644
index 00000000..8c05636a
--- /dev/null
+++ b/docs/algorithms/sac-n.md
@@ -0,0 +1,153 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# SAC-N
+
+## Overview
+
+SAC-N is a simple extension of the well-known online Soft Actor-Critic (SAC) algorithm. For an overview of online SAC,
+see the excellent [documentation at **CleanRL**](https://docs.cleanrl.dev/rl-algorithms/sac/). SAC utilizes a conventional
+technique from online RL, Clipped Double Q-learning, which uses the minimum value of two parallel Q-networks
+as the Bellman target. SAC-N modifies SAC by increasing the size of the Q-ensemble from $2$ to $N$ to prevent overestimation.
+That's it!
+
+
+Critic loss (change in blue):
+
+$$
+\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right]
+$$
+
+Actor loss (change in blue):
+
+$$
+\max _\theta \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \pi_\theta(\cdot \mid \mathbf{s})}\left[\min _{\color{blue}{j=1, \ldots, N}} Q_{\phi_j}(\mathbf{s}, \mathbf{a})-\alpha \log \pi_\theta(\mathbf{a} \mid \mathbf{s})\right]
+$$
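+
+In code, the only change relative to SAC is that the Bellman target takes the minimum over all $N$ critics instead of two. A PyTorch-style sketch (the `actor`/`target_critic` interfaces and tensor shapes are assumptions, not the exact CORL implementation):
+
+```python
+import torch
+
+@torch.no_grad()
+def critic_target(actor, target_critic, alpha, reward, next_state, done, gamma=0.99):
+    # sample a' ~ pi(.|s') and evaluate the whole target ensemble on it
+    next_action, next_log_prob = actor(next_state)
+    q_next = target_critic(next_state, next_action)             # [num_critics, batch]
+    q_next = q_next.min(dim=0).values - alpha * next_log_prob   # min over the N critics
+    return reward + gamma * (1.0 - done) * q_next
+```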
+
+Why does it work? There is a simple intuition given in the original paper. The clipped Q-learning algorithm, which uses the
+worst-case Q-value to compute a pessimistic estimate, can also be interpreted as utilizing the lower confidence bound (LCB) of the Q-value
+predictions. Suppose $Q(s, a)$ follows a Gaussian distribution with mean $m(s, a)$ and standard deviation $\sigma(s, a)$. Also,
+let $\left\{Q_j(\mathbf{s}, \mathbf{a})\right\}_{j=1}^N$ be realizations of $Q(s, a)$. Then, we can approximate the expected minimum of the realizations as
+
+$$
+\mathbb{E}\left[\min _{j=1, \ldots, N} Q_j(\mathbf{s}, \mathbf{a})\right] \approx m(\mathbf{s}, \mathbf{a})-\Phi^{-1}\left(\frac{N-\frac{\pi}{8}}{N-\frac{\pi}{4}+1}\right) \sigma(\mathbf{s}, \mathbf{a})
+$$
+
+where $\Phi$ is the CDF of the standard Gaussian distribution. This relation indicates that using the clipped Q-value
+is similar to penalizing the ensemble mean of the Q-values with the standard deviation scaled by a coefficient dependent on $N$.
+For OOD actions, the standard deviation will be higher, and thus the penalty will be stronger, preventing divergence.
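+
+To get a feel for how this penalty grows with the ensemble size, the coefficient $\Phi^{-1}\left(\frac{N-\pi/8}{N-\pi/4+1}\right)$ can be evaluated directly (a small illustration using `scipy`, not part of the CORL code):
+
+```python
+import numpy as np
+from scipy.stats import norm
+
+def lcb_coef(n):
+    # multiplier on the Q-value standard deviation implied by taking the min over n critics
+    return norm.ppf((n - np.pi / 8) / (n - np.pi / 4 + 1))
+
+for n in [2, 10, 25, 100]:
+    print(n, round(lcb_coef(n), 2))  # the penalty grows (slowly) with n
+```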
+
+Original paper:
+
+* [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble](https://arxiv.org/abs/2110.01548)
+
+Reference resources:
+
+* :material-github: [Official codebase for SAC-N and EDAC](https://github.com/snu-mllab/EDAC)
+
+
+!!! success
+ SAC-N is an extremely simple extension of online SAC and works quite well out of the box on the majority of benchmarks.
+ Usually only one parameter needs tuning: the size of the critic ensemble. It has SOTA results on the D4RL-Mujoco domain.
+
+!!! warning
+ Typically, SAC-N requires more time to converge, 3M updates instead of the usual 1M. Also, more complex tasks
+ may require a larger ensemble size, which will considerably increase training time. Finally,
+ SAC-N mysteriously does not work on the AntMaze domain. If you know how to fix this, let us know, it would be awesome!
+
+
+Possible extensions:
+
+* [Anti-Exploration by Random Network Distillation](https://arxiv.org/abs/2301.13616)
+* [Why So Pessimistic? Estimating Uncertainties for Offline RL through Ensembles, and Why Their Independence Matters](https://arxiv.org/abs/2205.13703)
+
+We'd be glad if someone would be interested in contributing them!
+
+## Implemented Variants
+
+| Variants Implemented | Description |
+|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------|
+| :material-github: [`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py) <br> :material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/sac_n) | For continuous action spaces and offline RL without fine-tuning support. |
+
+
+## Explanation of logged metrics
+
+* `critic_loss`: sum of the Q-ensemble individual mean losses (for loss definition see above)
+* `actor_loss`: mean actor loss (for loss definition see above)
+* `alpha_loss`: entropy regularization coefficient loss for automatic policy entropy tuning (see **CleanRL** docs for more details)
+* `batch_entropy`: estimation of the policy distribution entropy based on the batch states
+* `alpha`: coefficient for entropy regularization of the policy
+* `q_policy_std`: standard deviation of the Q-ensemble on a batch of states and policy actions
+* `q_random_std`: standard deviation of the Q-ensemble on a batch of states and random (OOD) actions
+* `eval/reward_mean`: mean undiscounted evaluation return
+* `eval/reward_std`: standard deviation of the undiscounted evaluation return across `config.eval_episodes` episodes
+* `eval/normalized_score_mean`: mean evaluation normalized score. Should be between 0 and 100, where 100+ is the
+ performance above expert for this environment. Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)].
+* `eval/normalized_score_std`: standard deviation of the evaluation normalized score across `config.eval_episodes` episodes
+
+## Implementation details
+
+1. Efficient ensemble implementation with vectorized linear layers, see the sketch after this list (:material-github:[algorithms/offline/sac_n.py#L174](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L174))
+2. Actor last layer initialization with small values (:material-github:[algorithms/offline/sac_n.py#L223](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L223))
+3. Critic last layer initialization with small values (but bigger than in actor) (:material-github:[algorithms/offline/sac_n.py#L283](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L283))
+4. Clipping bounds for actor `log_std` are different from the original online SAC (:material-github:[algorithms/offline/sac_n.py#L241](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/sac_n.py#L241))
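+
+The sketch below illustrates the idea behind item 1: all $N$ critics share one batched matmul instead of a Python loop over ensemble members (an illustration with our own naming, not the exact CORL module):
+
+```python
+import torch
+import torch.nn as nn
+
+class VectorizedLinear(nn.Module):
+    # num_models independent linear layers applied in a single batched matmul
+    def __init__(self, in_features, out_features, num_models):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(num_models, in_features, out_features) * 0.01)
+        self.bias = nn.Parameter(torch.zeros(num_models, 1, out_features))
+
+    def forward(self, x):
+        # x: [num_models, batch, in_features] -> [num_models, batch, out_features]
+        return x @ self.weight + self.bias
+```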
+
+## Experimental results
+
+For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md).
+Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly.
+
+
+
+## Training options
+
+```commandline
+usage: sac_n.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--hidden_dim int] [--num_critics int]
+ [--gamma float] [--tau float] [--actor_learning_rate float] [--critic_learning_rate float]
+ [--alpha_learning_rate float] [--max_action float] [--buffer_size int] [--env_name str] [--batch_size int]
+ [--num_epochs int] [--num_updates_on_epoch int] [--normalize_reward bool] [--eval_episodes int]
+ [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool] [--train_seed int]
+ [--eval_seed int] [--log_every int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: SAC-N)
+ --name str wandb run name (default: SAC-N)
+ --hidden_dim int actor and critic hidden dim (default: 256)
+ --num_critics int critic ensemble size (default: 10)
+ --gamma float discount factor (default: 0.99)
+ --tau float coefficient for the target critic Polyak's update (default: 0.005)
+ --actor_learning_rate float
+ actor learning rate (default: 0.0003)
+ --critic_learning_rate float
+ critic learning rate (default: 0.0003)
+ --alpha_learning_rate float
+ entropy coefficient learning rate for automatic tuning (default: 0.0003)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --buffer_size int maximum size of the replay buffer (default: 1000000)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --batch_size int training batch size (default: 256)
+ --num_epochs int total number of training epochs (default: 3000)
+ --num_updates_on_epoch int
+ number of gradient updates during one epoch (default: 1000)
+ --normalize_reward bool
+ whether to normalize reward (like in IQL) (default: False)
+ --eval_episodes int number of episodes to run during evaluation (default: 10)
+ --eval_every int evaluation frequency, will evaluate every eval_every epochs (default: 5)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --log_every int frequency of metrics logging to wandb (default: 100)
+ --device str training device (default: cuda)
+```
+
diff --git a/docs/algorithms/spot.md b/docs/algorithms/spot.md
new file mode 100644
index 00000000..bc991a94
--- /dev/null
+++ b/docs/algorithms/spot.md
@@ -0,0 +1 @@
+# SPOT
\ No newline at end of file
diff --git a/docs/algorithms/td3-bc.md b/docs/algorithms/td3-bc.md
new file mode 100644
index 00000000..8199ab8f
--- /dev/null
+++ b/docs/algorithms/td3-bc.md
@@ -0,0 +1 @@
+# TD3+BC
\ No newline at end of file
diff --git a/docs/assets/corl.pdf b/docs/assets/corl.pdf
new file mode 100644
index 00000000..d4eb872c
Binary files /dev/null and b/docs/assets/corl.pdf differ
diff --git a/docs/assets/logo.jpeg b/docs/assets/logo.jpeg
new file mode 100644
index 00000000..6fc0a734
Binary files /dev/null and b/docs/assets/logo.jpeg differ
diff --git a/docs/assets/perf_profiles_offline.pdf b/docs/assets/perf_profiles_offline.pdf
new file mode 100644
index 00000000..33c2cb0e
Binary files /dev/null and b/docs/assets/perf_profiles_offline.pdf differ
diff --git a/docs/assets/perf_profiles_online.pdf b/docs/assets/perf_profiles_online.pdf
new file mode 100644
index 00000000..c59684ac
Binary files /dev/null and b/docs/assets/perf_profiles_online.pdf differ
diff --git a/docs/benchmarks/offline-to-online.md b/docs/benchmarks/offline-to-online.md
new file mode 100644
index 00000000..32f9e4d4
--- /dev/null
+++ b/docs/benchmarks/offline-to-online.md
@@ -0,0 +1,60 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# Offline-to-online
+
+ Here, we report reproduced scores after offline pretraining and online fine-tuning for all datasets and offline-to-online algorithms considered.
+
+!!! tip
+ If you want to re-collect our results in a more structured/nuanced manner, see the [how to reproduce](repro.md) section.
+
+## Scores
+
+### Antmaze
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|antmaze-umaze-v2|52.75 ± 8.67 → 98.75 ± 1.09|94.00 ± 1.58 → 99.50 ± 0.87|77.00 ± 0.71 → 96.50 ± 1.12|91.00 ± 2.55 → 99.50 ± 0.50|76.75 ± 7.53 → 99.75 ± 0.43|
+|antmaze-umaze-diverse-v2|56.00 ± 2.74 → 0.00 ± 0.00|9.50 ± 9.91 → 99.00 ± 1.22|59.50 ± 9.55 → 63.75 ± 25.02|36.25 ± 2.17 → 95.00 ± 3.67|32.00 ± 27.79 → 98.50 ± 1.12|
+|antmaze-medium-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|59.00 ± 11.18 → 97.75 ± 1.30|71.75 ± 2.95 → 89.75 ± 1.09|67.25 ± 10.47 → 97.25 ± 1.30|71.75 ± 3.27 → 98.75 ± 1.64|
+|antmaze-medium-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|63.50 ± 6.84 → 97.25 ± 1.92|64.25 ± 1.92 → 92.25 ± 2.86|73.75 ± 7.29 → 94.50 ± 1.66|62.00 ± 4.30 → 98.25 ± 1.48|
+|antmaze-large-play-v2|0.00 ± 0.00 → 0.00 ± 0.00|28.75 ± 7.76 → 88.25 ± 2.28|38.50 ± 8.73 → 64.50 ± 17.04|31.50 ± 12.58 → 87.00 ± 3.24|31.75 ± 8.87 → 97.25 ± 1.79|
+|antmaze-large-diverse-v2|0.00 ± 0.00 → 0.00 ± 0.00|35.50 ± 3.64 → 91.75 ± 3.96|26.75 ± 3.77 → 64.25 ± 4.15|17.50 ± 7.26 → 81.00 ± 14.14|44.00 ± 8.69 → 91.50 ± 3.91|
+| **average** |18.12 → 16.46|48.38 → 95.58|56.29 → 78.50|52.88 → 92.38|53.04 → 97.33|
+
+### Adroit
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|pen-cloned-v1|88.66 ± 15.10 → 86.82 ± 11.12|-2.76 ± 0.08 → -1.28 ± 2.16|84.19 ± 3.96 → 102.02 ± 20.75|6.19 ± 5.21 → 43.63 ± 20.09|-2.66 ± 0.04 → -2.68 ± 0.12|
+|door-cloned-v1|0.93 ± 1.66 → 0.01 ± 0.00|-0.33 ± 0.01 → -0.33 ± 0.01|1.19 ± 0.93 → 20.34 ± 9.32|-0.21 ± 0.14 → 0.02 ± 0.31|-0.33 ± 0.01 → -0.33 ± 0.01|
+|hammer-cloned-v1|1.80 ± 3.01 → 0.24 ± 0.04|0.56 ± 0.55 → 2.85 ± 4.81|1.35 ± 0.32 → 57.27 ± 28.49|3.97 ± 6.39 → 3.73 ± 4.99|0.25 ± 0.04 → 0.17 ± 0.17|
+|relocate-cloned-v1|-0.04 ± 0.04 → -0.04 ± 0.01|-0.33 ± 0.01 → -0.33 ± 0.01|0.04 ± 0.04 → 0.32 ± 0.38|-0.24 ± 0.01 → -0.15 ± 0.05|-0.31 ± 0.05 → -0.31 ± 0.04|
+| **average** |22.84 → 21.76|-0.72 → 0.22|21.69 → 44.99|2.43 → 11.81|-0.76 → -0.79|
+
+## Regrets
+
+### Antmaze
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|antmaze-umaze-v2|0.04 ± 0.01|0.02 ± 0.00|0.07 ± 0.00|0.02 ± 0.00|0.01 ± 0.00|
+|antmaze-umaze-diverse-v2|0.88 ± 0.01|0.09 ± 0.01|0.43 ± 0.11|0.22 ± 0.07|0.05 ± 0.01|
+|antmaze-medium-play-v2|1.00 ± 0.00|0.08 ± 0.01|0.09 ± 0.01|0.06 ± 0.00|0.04 ± 0.01|
+|antmaze-medium-diverse-v2|1.00 ± 0.00|0.08 ± 0.00|0.10 ± 0.01|0.05 ± 0.01|0.04 ± 0.01|
+|antmaze-large-play-v2|1.00 ± 0.00|0.21 ± 0.02|0.34 ± 0.05|0.29 ± 0.07|0.13 ± 0.02|
+|antmaze-large-diverse-v2|1.00 ± 0.00|0.21 ± 0.03|0.41 ± 0.03|0.23 ± 0.08|0.13 ± 0.02|
+| **average** |0.82|0.11|0.24|0.15|0.07|
+
+### Adroit
+
+| **Task-Name** |AWAC|CQL|IQL|SPOT|Cal-QL|
+|--------------------------|------------|--------|--------|-----|-----|
+|pen-cloned-v1|0.46 ± 0.02|0.97 ± 0.00|0.37 ± 0.01|0.58 ± 0.02|0.98 ± 0.01|
+|door-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.83 ± 0.03|0.99 ± 0.01|1.00 ± 0.00|
+|hammer-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|0.65 ± 0.10|0.98 ± 0.01|1.00 ± 0.00|
+|relocate-cloned-v1|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|1.00 ± 0.00|
+| **average** |0.86|0.99|0.71|0.89|0.99|
+
+## Visual summary
+
+![](../assets/perf_profiles_online.pdf)
\ No newline at end of file
diff --git a/docs/benchmarks/offline.md b/docs/benchmarks/offline.md
new file mode 100644
index 00000000..f83e0746
--- /dev/null
+++ b/docs/benchmarks/offline.md
@@ -0,0 +1,122 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# Offline
+
+ Here, we report reproduced **final** and **best** scores for all datasets and offline algorithms considered. Note that they differ by a significant
+ margin, and some papers may use different approaches, without always making it explicit which reporting methodology they chose.
+
+!!! tip
+ If you want to re-collect our results in a more structured/nuanced manner, see the [how to reproduce](repro.md) section.
+
+## Last Scores
+### Gym-MuJoCo
+
+| **Task-Name**|BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT|
+|------------------------------|------------|--------|--------|--------|-----|-----|------|-------|----------------|----|
+|halfcheetah-medium-v2|42.40 ± 0.19|42.46 ± 0.70|48.10 ± 0.18|50.02 ± 0.27|47.04 ± 0.22|48.31 ± 0.22|64.04 ± 0.68|68.20 ± 1.28| 67.70 ± 1.04 |42.20 ± 0.26|
+|halfcheetah-medium-replay-v2|35.66 ± 2.33|23.59 ± 6.95|44.84 ± 0.59|45.13 ± 0.88|45.04 ± 0.27|44.46 ± 0.22|51.18 ± 0.31|60.70 ± 1.01| 62.06 ± 1.10 |38.91 ± 0.50|
+|halfcheetah-medium-expert-v2|55.95 ± 7.35|90.10 ± 2.45|90.78 ± 6.04|95.00 ± 0.61|95.63 ± 0.42|94.74 ± 0.52|103.80 ± 2.95|98.96 ± 9.31| 104.76 ± 0.64 |91.55 ± 0.95|
+|hopper-medium-v2|53.51 ± 1.76|55.48 ± 7.30|60.37 ± 3.49|63.02 ± 4.56|59.08 ± 3.77|67.53 ± 3.78|102.29 ± 0.17|40.82 ± 9.91| 101.70 ± 0.28 |65.10 ± 1.61|
+|hopper-medium-replay-v2|29.81 ± 2.07|70.42 ± 8.66|64.42 ± 21.52|98.88 ± 2.07|95.11 ± 5.27|97.43 ± 6.39|94.98 ± 6.53|100.33 ± 0.78| 99.66 ± 0.81 |81.77 ± 6.87|
+|hopper-medium-expert-v2|52.30 ± 4.01|111.16 ± 1.03|101.17 ± 9.07|101.90 ± 6.22|99.26 ± 10.91|107.42 ± 7.80|109.45 ± 2.34|101.31 ± 11.63| 105.19 ± 10.08 |110.44 ± 0.33|
+|walker2d-medium-v2|63.23 ± 16.24|67.34 ± 5.17|82.71 ± 4.78|68.52 ± 27.19|80.75 ± 3.28|80.91 ± 3.17|85.82 ± 0.77|87.47 ± 0.66| 93.36 ± 1.38 |67.63 ± 2.54|
+|walker2d-medium-replay-v2|21.80 ± 10.15|54.35 ± 6.34|85.62 ± 4.01|80.62 ± 3.58|73.09 ± 13.22|82.15 ± 3.03|84.25 ± 2.25|78.99 ± 0.50| 87.10 ± 2.78 |59.86 ± 2.73|
+|walker2d-medium-expert-v2|98.96 ± 15.98|108.70 ± 0.25|110.03 ± 0.36|111.44 ± 1.62|109.56 ± 0.39|111.72 ± 0.86|111.86 ± 0.43|114.93 ± 0.41| 114.75 ± 0.74 |107.11 ± 0.96|
+| **locomotion average** |50.40|69.29|76.45|79.39|78.28|81.63|89.74|83.52| **92.92** |73.84|
+
+### Maze2d
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC| SAC-N |EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|------|----------------|------|----|
+|maze2d-umaze-v1|0.36 ± 8.69|12.18 ± 4.29|29.41 ± 12.31|65.65 ± 5.34|-8.90 ± 6.11|42.11 ± 0.58|106.87 ± 22.16| 130.59 ± 16.52 |95.26 ± 6.39|18.08 ± 25.42|
+|maze2d-medium-v1|0.79 ± 3.25|14.25 ± 2.33|59.45 ± 36.25|84.63 ± 35.54|86.11 ± 9.68|34.85 ± 2.72|105.11 ± 31.67| 88.61 ± 18.72 |57.04 ± 3.45|31.71 ± 26.33|
+|maze2d-large-v1|2.26 ± 4.39|11.32 ± 5.10|97.10 ± 25.41|215.50 ± 3.11|23.75 ± 36.70|61.72 ± 3.50|78.33 ± 61.77| 204.76 ± 1.19 |95.60 ± 22.92|35.66 ± 28.20|
+| **maze2d average** |1.13|12.58|61.99|121.92|33.65|46.23|96.77| **141.32** |82.64|28.48|
+
+### Antmaze
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|antmaze-umaze-v2|55.25 ± 4.15|65.75 ± 5.26|70.75 ± 39.18|56.75 ± 9.09|92.75 ± 1.92|77.00 ± 5.52| 97.75 ± 1.48 |0.00 ± 0.00|0.00 ± 0.00|57.00 ± 9.82|
+|antmaze-umaze-diverse-v2|47.25 ± 4.09|44.00 ± 1.00|44.75 ± 11.61|54.75 ± 8.01|37.25 ± 3.70|54.25 ± 5.54| 83.50 ± 7.02 |0.00 ± 0.00|0.00 ± 0.00|51.75 ± 0.43|
+|antmaze-medium-play-v2|0.00 ± 0.00|2.00 ± 0.71|0.25 ± 0.43|0.00 ± 0.00|65.75 ± 11.61|65.75 ± 11.71| 89.50 ± 3.35 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-medium-diverse-v2|0.75 ± 0.83|5.75 ± 9.39|0.25 ± 0.43|0.00 ± 0.00|67.25 ± 3.56|73.75 ± 5.45| 83.50 ± 8.20 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-play-v2|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|20.75 ± 7.26|42.00 ± 4.53| 52.25 ± 29.01 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-diverse-v2|0.00 ± 0.00|0.75 ± 0.83|0.00 ± 0.00|0.00 ± 0.00|20.50 ± 13.24|30.25 ± 3.63| 64.00 ± 5.43 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+| **antmaze average** | 17.21|19.71|19.33|18.58|50.71|57.17| **78.42** |0.00|0.00|18.12|
+
+### Adroit
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|pen-human-v1|71.03 ± 6.26|26.99 ± 9.60|-3.88 ± 0.21|76.65 ± 11.71|13.71 ± 16.98|78.49 ± 8.21| 103.16 ± 8.49 |6.86 ± 5.93|5.07 ± 6.16|67.68 ± 5.48|
+|pen-cloned-v1|51.92 ± 15.15|46.67 ± 14.25|5.13 ± 5.28|85.72 ± 16.92|1.04 ± 6.62|83.42 ± 8.19| 102.79 ± 7.84 |31.35 ± 2.14|12.02 ± 1.75|64.43 ± 1.43|
+|pen-expert-v1|109.65 ± 7.28|114.96 ± 2.96|122.53 ± 21.27|159.91 ± 1.87|-1.41 ± 2.34|128.05 ± 9.21| 152.16 ± 6.33 |87.11 ± 48.95|-1.55 ± 0.81|116.38 ± 1.27|
+|door-human-v1|2.34 ± 4.00|-0.13 ± 0.07|-0.33 ± 0.01|2.39 ± 2.26|5.53 ± 1.31|3.26 ± 1.83| -0.10 ± 0.01 |-0.38 ± 0.00|-0.12 ± 0.13|4.44 ± 0.87|
+|door-cloned-v1|-0.09 ± 0.03|0.29 ± 0.59|-0.34 ± 0.01|-0.01 ± 0.01|-0.33 ± 0.01|3.07 ± 1.75| 0.06 ± 0.05 |-0.33 ± 0.00|2.66 ± 2.31|7.64 ± 3.26|
+|door-expert-v1|105.35 ± 0.09|104.04 ± 1.46|-0.33 ± 0.01|104.57 ± 0.31|-0.32 ± 0.02|106.65 ± 0.25| 106.37 ± 0.29 |-0.33 ± 0.00|106.29 ± 1.73|104.87 ± 0.39|
+|hammer-human-v1|3.03 ± 3.39|-0.19 ± 0.02|1.02 ± 0.24|1.01 ± 0.51|0.14 ± 0.11|1.79 ± 0.80| 0.24 ± 0.24 |0.24 ± 0.00|0.28 ± 0.18|1.28 ± 0.15|
+|hammer-cloned-v1|0.55 ± 0.16|0.12 ± 0.08|0.25 ± 0.01|1.27 ± 2.11|0.30 ± 0.01|1.50 ± 0.69| 5.00 ± 3.75 |0.14 ± 0.09|0.19 ± 0.07|1.82 ± 0.55|
+|hammer-expert-v1|126.78 ± 0.64|121.75 ± 7.67|3.11 ± 0.03|127.08 ± 0.13|0.26 ± 0.01|128.68 ± 0.33| 133.62 ± 0.27 |25.13 ± 43.25|28.52 ± 49.00|117.45 ± 6.65|
+|relocate-human-v1|0.04 ± 0.03|-0.14 ± 0.08|-0.29 ± 0.01|0.45 ± 0.53|0.06 ± 0.03|0.12 ± 0.04| 0.16 ± 0.30 |-0.31 ± 0.01|-0.17 ± 0.17|0.05 ± 0.01|
+|relocate-cloned-v1|-0.06 ± 0.01|-0.00 ± 0.02|-0.30 ± 0.01|-0.01 ± 0.03|-0.29 ± 0.01|0.04 ± 0.01| 1.66 ± 2.59 |-0.01 ± 0.10|0.17 ± 0.35|0.16 ± 0.09|
+|relocate-expert-v1|107.58 ± 1.20|97.90 ± 5.21|-1.73 ± 0.96|109.52 ± 0.47|-0.30 ± 0.02|106.11 ± 4.02| 107.52 ± 2.28 |-0.36 ± 0.00|71.94 ± 18.37|104.28 ± 0.42|
+| | | | | | | | | | | |
+| **adroit average** | 48.18|42.69|10.40|55.71|1.53|53.43| **59.39** |12.43|18.78|49.21|
+
+## Best Scores
+### Gym-MuJoCo
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL|ReBRAC|SAC-N| EDAC |DT|
+|--------------------|------------|--------|--------|--------|-----|-----|------|-------|---------------|----|
+|halfcheetah-medium-v2|43.60 ± 0.14|43.90 ± 0.13|48.93 ± 0.11|50.81 ± 0.15|47.62 ± 0.03|48.84 ± 0.07|65.62 ± 0.46|72.21 ± 0.31| 69.72 ± 0.92 |42.73 ± 0.10|
+|halfcheetah-medium-replay-v2|40.52 ± 0.19|42.27 ± 0.46|45.84 ± 0.26|46.47 ± 0.26|46.43 ± 0.19|45.35 ± 0.08|52.22 ± 0.31|67.29 ± 0.34| 66.55 ± 1.05 |40.31 ± 0.28|
+|halfcheetah-medium-expert-v2|79.69 ± 3.10|94.11 ± 0.22|96.59 ± 0.87|96.83 ± 0.23|97.04 ± 0.17|95.38 ± 0.17|108.89 ± 1.20|111.73 ± 0.47| 110.62 ± 1.04 |93.40 ± 0.21|
+|hopper-medium-v2|69.04 ± 2.90|73.84 ± 0.37|70.44 ± 1.18|95.42 ± 3.67|70.80 ± 1.98|80.46 ± 3.09|103.19 ± 0.16|101.79 ± 0.20| 103.26 ± 0.14 |69.42 ± 3.64|
+|hopper-medium-replay-v2|68.88 ± 10.33|90.57 ± 2.07|98.12 ± 1.16|101.47 ± 0.23|101.63 ± 0.55|102.69 ± 0.96|102.57 ± 0.45|103.83 ± 0.53| 103.28 ± 0.49 |88.74 ± 3.02|
+|hopper-medium-expert-v2|90.63 ± 10.98|113.13 ± 0.16|113.22 ± 0.43|113.26 ± 0.49|112.84 ± 0.66|113.18 ± 0.38|113.16 ± 0.43|111.24 ± 0.15| 111.80 ± 0.11 |111.18 ± 0.21|
+|walker2d-medium-v2|80.64 ± 0.91|82.05 ± 0.93|86.91 ± 0.28|85.86 ± 3.76|84.77 ± 0.20|87.58 ± 0.48|87.79 ± 0.19|90.17 ± 0.54| 95.78 ± 1.07 |74.70 ± 0.56|
+|walker2d-medium-replay-v2|48.41 ± 7.61|76.09 ± 0.40|91.17 ± 0.72|86.70 ± 0.94|89.39 ± 0.88|89.94 ± 0.93|91.11 ± 0.63|85.18 ± 1.63| 89.69 ± 1.39 |68.22 ± 1.20|
+|walker2d-medium-expert-v2|109.95 ± 0.62|109.90 ± 0.09|112.21 ± 0.06|113.40 ± 2.22|111.63 ± 0.38|113.06 ± 0.53|112.49 ± 0.18|116.93 ± 0.42| 116.52 ± 0.75 |108.71 ± 0.34|
+| **locomotion average** | 70.15|80.65|84.83|87.80|84.68|86.28|93.00|95.60| **96.36** |77.49|
+
+
+### Maze2d
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|maze2d-umaze-v1|16.09 ± 0.87|22.49 ± 1.52|99.33 ± 16.16|136.96 ± 10.89|92.05 ± 13.66|50.92 ± 4.23| 162.28 ± 1.79 |153.12 ± 6.49|149.88 ± 1.97|63.83 ± 17.35|
+|maze2d-medium-v1|19.16 ± 1.24|27.64 ± 1.87|150.93 ± 3.89|152.73 ± 20.78|128.66 ± 5.44|122.69 ± 30.00| 150.12 ± 4.48 |93.80 ± 14.66|154.41 ± 1.58|68.14 ± 12.25|
+|maze2d-large-v1|20.75 ± 6.66|41.83 ± 3.64|197.64 ± 5.26|227.31 ± 1.47|157.51 ± 7.32|162.25 ± 44.18| 197.55 ± 5.82 |207.51 ± 0.96|182.52 ± 2.68|50.25 ± 19.34|
+| | | | | | | | | | | |
+| **maze2d average** | 18.67|30.65|149.30|172.33|126.07|111.95| **169.98** |151.48|162.27|60.74|
+
+### Antmaze
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|antmaze-umaze-v2|68.50 ± 2.29|77.50 ± 1.50|98.50 ± 0.87|70.75 ± 8.84|94.75 ± 0.83|84.00 ± 4.06| 100.00 ± 0.00 |0.00 ± 0.00|42.50 ± 28.61|64.50 ± 2.06|
+|antmaze-umaze-diverse-v2|64.75 ± 4.32|63.50 ± 2.18|71.25 ± 5.76|81.50 ± 4.27|53.75 ± 2.05|79.50 ± 3.35| 96.75 ± 2.28 |0.00 ± 0.00|0.00 ± 0.00|60.50 ± 2.29|
+|antmaze-medium-play-v2|4.50 ± 1.12|6.25 ± 2.38|3.75 ± 1.30|25.00 ± 10.70|80.50 ± 3.35|78.50 ± 3.84| 93.50 ± 2.60 |0.00 ± 0.00|0.00 ± 0.00|0.75 ± 0.43|
+|antmaze-medium-diverse-v2|4.75 ± 1.09|16.50 ± 5.59|5.50 ± 1.50|10.75 ± 5.31|71.00 ± 4.53|83.50 ± 1.80| 91.75 ± 2.05 |0.00 ± 0.00|0.00 ± 0.00|0.50 ± 0.50|
+|antmaze-large-play-v2|0.50 ± 0.50|13.50 ± 9.76|1.25 ± 0.43|0.50 ± 0.50|34.75 ± 5.85|53.50 ± 2.50| 68.75 ± 13.90 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+|antmaze-large-diverse-v2|0.75 ± 0.43|6.25 ± 1.79|0.25 ± 0.43|0.00 ± 0.00|36.25 ± 3.34|53.00 ± 3.00| 69.50 ± 7.26 |0.00 ± 0.00|0.00 ± 0.00|0.00 ± 0.00|
+| **antmaze average** |23.96|30.58|30.08|31.42|61.83|72.00| **86.71** |0.00|7.08|21.04|
+
+### Adroit
+| **Task-Name** |BC|10% BC|TD3+BC|AWAC|CQL|IQL| ReBRAC |SAC-N|EDAC|DT|
+|--------------------|------------|--------|--------|--------|-----|-----|---------------|-------|------|----|
+|pen-human-v1|99.69 ± 7.45|59.89 ± 8.03|9.95 ± 8.19|119.03 ± 6.55|58.91 ± 1.81|106.15 ± 10.28| 127.28 ± 3.22 |56.48 ± 7.17|35.84 ± 10.57|77.83 ± 2.30|
+|pen-cloned-v1|99.14 ± 12.27|83.62 ± 11.75|52.66 ± 6.33|125.78 ± 3.28|14.74 ± 2.31|114.05 ± 4.78| 128.64 ± 7.15 |52.69 ± 5.30|26.90 ± 7.85|71.17 ± 2.70|
+|pen-expert-v1|128.77 ± 5.88|134.36 ± 3.16|142.83 ± 7.72|162.53 ± 0.30|14.86 ± 4.07|140.01 ± 6.36| 157.62 ± 0.26 |116.43 ± 40.26|36.04 ± 4.60|119.49 ± 2.31|
+|door-human-v1|9.41 ± 4.55|7.00 ± 6.77|-0.11 ± 0.06|17.70 ± 2.55|13.28 ± 2.77|13.52 ± 1.22| 0.27 ± 0.43 |-0.10 ± 0.06|2.51 ± 2.26|7.36 ± 1.24|
+|door-cloned-v1|3.40 ± 0.95|10.37 ± 4.09|-0.20 ± 0.11|10.53 ± 2.82|-0.08 ± 0.13|9.02 ± 1.47| 7.73 ± 6.80 |-0.21 ± 0.10|20.36 ± 1.11|11.18 ± 0.96|
+|door-expert-v1|105.84 ± 0.23|105.92 ± 0.24|4.49 ± 7.39|106.60 ± 0.27|59.47 ± 25.04|107.29 ± 0.37| 106.78 ± 0.04 |0.05 ± 0.02|109.22 ± 0.24|105.49 ± 0.09|
+|hammer-human-v1|12.61 ± 4.87|6.23 ± 4.79|2.38 ± 0.14|16.95 ± 3.61|0.30 ± 0.05|6.86 ± 2.38| 1.18 ± 0.15 |0.25 ± 0.00|3.49 ± 2.17|1.68 ± 0.11|
+|hammer-cloned-v1|8.90 ± 4.04|8.72 ± 3.28|0.96 ± 0.30|10.74 ± 5.54|0.32 ± 0.03|11.63 ± 1.70| 48.16 ± 6.20 |12.67 ± 15.02|0.27 ± 0.01|2.74 ± 0.22|
+|hammer-expert-v1|127.89 ± 0.57|128.15 ± 0.66|33.31 ± 47.65|129.08 ± 0.26|0.93 ± 1.12|129.76 ± 0.37| 134.74 ± 0.30 |91.74 ± 47.77|69.44 ± 47.00|127.39 ± 0.10|
+|relocate-human-v1|0.59 ± 0.27|0.16 ± 0.14|-0.29 ± 0.01|1.77 ± 0.84|1.03 ± 0.20|1.22 ± 0.28| 3.70 ± 2.34 |-0.18 ± 0.14|0.05 ± 0.02|0.08 ± 0.02|
+|relocate-cloned-v1|0.45 ± 0.31|0.74 ± 0.45|-0.02 ± 0.04|0.39 ± 0.13|-0.07 ± 0.02|1.78 ± 0.70| 9.25 ± 2.56 |0.10 ± 0.04|4.11 ± 1.39|0.34 ± 0.09|
+|relocate-expert-v1|110.31 ± 0.36|109.77 ± 0.60|0.23 ± 0.27|111.21 ± 0.32|0.03 ± 0.10|110.12 ± 0.82| 111.14 ± 0.23 |-0.07 ± 0.08|98.32 ± 3.75|106.49 ± 0.30|
+| **adroit average** | 58.92|54.58|20.51|67.69|13.65|62.62| **69.71** |27.49|33.88|52.60|
+
+## Visual summary
+
+![](../assets/perf_profiles_offline.pdf)
\ No newline at end of file
diff --git a/docs/benchmarks/repro.md b/docs/benchmarks/repro.md
new file mode 100644
index 00000000..b90758fa
--- /dev/null
+++ b/docs/benchmarks/repro.md
@@ -0,0 +1,30 @@
+# How to Reproduce
+
+To reproduce all figures and tables from our [technical paper](https://arxiv.org/abs/2210.07105), follow the steps below.
+
+## Collect wandb logs
+
+These scripts collect all wandb logs into .csv files and save them into the `runs_tables` folder.
+We provide these tables, but you can re-collect them yourself.
+```commandline
+python results/get_offline_urls.py
+python results/get_finetune_urls.py
+```
+
+## Collect scores
+
+These scripts collect data from the runs listed in the .csv files and save evaluation scores (and regret in the case of offline-to-online)
+into pickled files, which are stored in the `bin` folder. We provide the pickled data, but if you need to extract more data,
+you can modify the scripts for your purposes.
+```commandline
+python results/get_offline_scores.py
+python results/get_finetune_scores.py
+```
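+
+If you want to work with the extracted scores directly, they can be unpickled from the `bin` folder. A minimal sketch (the filename below is hypothetical, check the scripts for the exact names; scores are grouped per algorithm and per dataset, see `results/get_offline_scores.py`):
+```python
+import pickle
+
+# hypothetical filename; see results/get_offline_scores.py for what is actually written
+with open("bin/offline_scores.pickle", "rb") as f:
+    full_scores = pickle.load(f)
+
+# full_scores[algorithm][dataset] holds a list of per-run evaluation scores
+for algorithm, datasets in full_scores.items():
+    print(algorithm, sorted(datasets))
+```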
+
+## Print tables
+
+These scripts use pickled data, print all the tables, and save all figures into the `out` directory.
+```commandline
+python results/get_offline_tables_and_plots.py
+python results/get_finetune_tables_and_plots.py
+```
\ No newline at end of file
diff --git a/docs/community/contrib.md b/docs/community/contrib.md
new file mode 100644
index 00000000..1f91155b
--- /dev/null
+++ b/docs/community/contrib.md
@@ -0,0 +1,116 @@
+# Contribution
+
+## Contributing to the codebase
+
+We welcome:
+
+- Bug reports
+- Pull requests for bug fixes
+- Logs and documentation improvements
+- New algorithms and datasets
+- Better hyperparameters (but with supporting evidence)
+
+### Setup
+
+Contributing code is done through the standard GitHub workflow:
+
+1. Fork this repo
+2. Make a change and commit your code
+3. Submit a pull request. It will be reviewed by maintainers, and they'll give feedback or make requests as applicable
+
+```commandline
+git clone git@github.com:corl-team/CORL.git
+cd CORL
+pip install -r requirements/requirements_dev.txt
+```
+
+For dependency installation, see the [get started section](../get-started/install.md).
+
+### Code style
+
+The CI will run several checks on the new code pushed to the CORL repository.
+These checks can also be run locally without waiting for the CI by following the steps below:
+
+
+1. [install `pre-commit`](https://pre-commit.com/#install),
+2. install the Git hooks by running `pre-commit install`.
+
+Once those two steps are done, the Git hooks will be run automatically at every new commit.
+The Git hooks can also be run manually with `pre-commit run --all-files`, and
+if needed they can be skipped (not recommended) with `git commit --no-verify`.
+
+We use [Ruff](https://github.com/astral-sh/ruff) as our main linter. If you want to see possible
+problems before running pre-commit, you can run `ruff check --diff .` to see the exact linter suggestions and fixes.
+
+## Adding new algorithms
+
+!!! warning
+    While we welcome any algorithms, it is better to open an issue with the proposal first,
+    so we can discuss the details. Unfortunately, not all algorithms are equally
+    easy to understand and reproduce. We may be able to give you some advice,
+    or, on the contrary, warn you that this particular algorithm requires too many
+    computational resources to fully reproduce the results and that it is better to work on something else.
+
+
+All new algorithms should go into `algorithms/contrib/offline` for purely
+offline algorithms and into `algorithms/contrib/finetune` for offline-to-online algorithms.
+
+We as a team try to keep the core as reliable and reproducible as possible,
+but we may not have the resources to support all future algorithms.
+Therefore, this separation is necessary, as we cannot guarantee that all
+algorithms from `algorithms/contrib` exactly reproduce the results of their original publications.
+
+Make sure your new code is properly documented and all references to the original implementations and papers are present
+(for example as in [Decision Transformer](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py)).
+Follow the conventions for naming config arguments, functions, and classes. Try to stylistically imitate the already existing implementations.
+
+Please, **explain all the tricks and possible differences from the original implementation in as much detail as possible**.
+Keep in mind that this code may be used by other researchers. Make their lives easier!
+
+### Running benchmarks
+
+Although you will have to do a hyperparameter search while reproducing the algorithm,
+in the end we expect to see final configs in `configs/contrib///.yaml` with the best hyperparameters for all
+datasets considered. The configs should be in `yaml` format, containing all hyperparameters sorted
+in alphabetical order (see existing configs for inspiration).
+
+Use these conventions to name your runs in the configs:
+1. `name: `
+2. `group: --multiseed-v0`, increment version if needed
+3. use our [\_\_post_init\_\_](https://github.com/tinkoff-ai/CORL/blob/962688b405f579a1ce6ec1b57e6369aaf76f9e69/algorithms/offline/awac.py#L48) implementation in your config dataclass (a rough sketch is shown below)
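+
+For illustration, such a config dataclass could look like this (not the exact CORL code; the algorithm and run names are purely illustrative):
+```python
+import os
+import uuid
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class TrainConfig:
+    project: str = "CORL"
+    group: str = "new-algo-halfcheetah-medium-v2-multiseed-v0"
+    name: str = "new-algo"
+    env_name: str = "halfcheetah-medium-v2"
+    checkpoints_path: Optional[str] = None
+
+    def __post_init__(self):
+        # make each run name unique by appending the dataset name and a short random suffix
+        self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
+        if self.checkpoints_path is not None:
+            self.checkpoints_path = os.path.join(self.checkpoints_path, self.name)
+```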
+
+Since we are releasing wandb logs for all algorithms, you will need to submit multiseed (~4 seeds)
+training runs to the `CORL` project in the wandb [corl-team](https://wandb.ai/corl-team) organization. We'll invite you there when the time comes.
+
+We usually use wandb sweeps for this. You can use this example config (it will work with pyrallis, as it expects the `config_path` cli argument):
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/contrib/.py
+method: grid
+parameters:
+ config_path:
+ # algo_type is offline or finetune (see sections above)
+ values: [
+ "configs/contrib///.yaml",
+ "configs/contrib///.yaml",
+ "configs/contrib///.yaml",
+ ]
+ train_seed:
+ values: [0, 1, 2, 3]
+```
+Then proceed as usual: create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent `.
+
+Based on the results, you will need to create wandb reports so that other users can easily inspect them.
+You can use any of the already existing reports as an example (see the [README.md](https://github.com/corl-team/CORL/tree/main)).
+
+### Checklist
+
+Ideally, all checks should be completed!
+
+- [ ] Issue about new algorithm is open
+- [ ] Single-file implementation is added to the `algorithms/contrib`
+- [ ] PR has passed all the tests
+- [ ] Evidence that implementation reproduces original results is provided
+- [ ] Configs with the best hyperparameters for all datasets are added to the `configs/contrib`
+- [ ] Logs and reports for best hyperparameters are submitted to our wandb organization
diff --git a/docs/community/publications.md b/docs/community/publications.md
new file mode 100644
index 00000000..cabc4960
--- /dev/null
+++ b/docs/community/publications.md
@@ -0,0 +1,28 @@
+# List of Publications
+
+!!! tip
+ Please open a pull request to add missing entries!
+
+List of publications that are using CORL algorithms or benchmarked results:
+
+- Lu, C., Ball, P. J., & Parker-Holder, J. Synthetic Experience Replay.
+- Beeson, A., & Montana, G. (2023). Balancing policy constraint and ensemble size in uncertainty-based offline reinforcement learning. arXiv preprint arXiv:2303.14716.
+- Nikulin, A., Kurenkov, V., Tarasov, D., & Kolesnikov, S. (2023). Anti-exploration by random network distillation. arXiv preprint arXiv:2301.13616.
+- Bhargava, P., Chitnis, R., Geramifard, A., Sodhani, S., & Zhang, A. (2023). Sequence Modeling is a Robust Contender for Offline Reinforcement Learning. arXiv preprint arXiv:2305.14550.
+- Hu, X., Ma, Y., Xiao, C., Zheng, Y., & Meng, Z. (2023). In-Sample Policy Iteration for Offline Reinforcement Learning. arXiv preprint arXiv:2306.05726.
+- Lian, S., Ma, Y., Liu, J., Zheng, Y., & Meng, Z. (2023). HIPODE: Enhancing Offline Reinforcement Learning with High-Quality Synthetic Data from a Policy-Decoupled Approach. arXiv preprint arXiv:2306.06329.
+- He, H., Bai, C., Xu, K., Yang, Z., Zhang, W., Wang, D., ... & Li, X. (2023). Diffusion Model is an Effective Planner and Data Synthesizer for Multi-Task Reinforcement Learning. arXiv preprint arXiv:2305.18459.
+- Liu, J., Ma, Y., Hao, J., Hu, Y., Zheng, Y., Lv, T., & Fan, C. (2023). Prioritized Trajectory Replay: A Replay Memory for Data-driven Reinforcement Learning. arXiv preprint arXiv:2306.15503.
+- Chitnis, R., Xu, Y., Hashemi, B., Lehnert, L., Dogan, U., Zhu, Z., & Delalleau, O. (2023). IQL-TD-MPC: Implicit Q-Learning for Hierarchical Model Predictive Control. arXiv preprint arXiv:2306.00867.
+- Kurenkov, V., Nikulin, A., Tarasov, D., & Kolesnikov, S. (2023). Katakomba: Tools and Benchmarks for Data-Driven NetHack. arXiv preprint arXiv:2306.08772.
+- Lian, S., Ma, Y., Liu, J., Jianye, H. A. O., Zheng, Y., & Meng, Z. (2023, July). A Policy-Decoupled Method for High-Quality Data Augmentation in Offline Reinforcement Learning. In ICML Workshop on New Frontiers in Learning, Control, and Dynamical Systems.
\ No newline at end of file
diff --git a/docs/get-started/install.md b/docs/get-started/install.md
new file mode 100644
index 00000000..3b788d3d
--- /dev/null
+++ b/docs/get-started/install.md
@@ -0,0 +1,38 @@
+# Installation
+
+## Manual
+!!! warning
+    Unfortunately, installing all dependencies can cause some difficulties at the moment, mainly due to **D4RL** and
+    the old version of MuJoCo it is locked to. It will become much easier once the migration to **Minari** is done.
+
+All necessary dependencies are specified in the [`requirements/requirements.txt`](https://github.com/corl-team/CORL/blob/main/requirements/requirements.txt) file.
+You can just clone the repo and install all dependencies with pip:
+```commandline
+git clone https://github.com/corl-team/CORL.git
+cd CORL
+pip install -r requirements/requirements.txt
+```
+
+In addition to the packages specified there, the dependencies required by D4RL, namely the MuJoCo binaries, must also be installed.
+We recommend following the official guide from [**mujoco-py**](https://github.com/openai/mujoco-py). You will need to download
+the MuJoCo 2.1 binaries and extract the downloaded `mujoco210` directory to `~/.mujoco/mujoco210`:
+```commandline
+mkdir -p ~/.mujoco \
+ && wget https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz \
+ && tar -xf mujoco.tar.gz -C ~/.mujoco \
+ && rm mujoco.tar.gz
+export LD_LIBRARY_PATH=~/.mujoco/mujoco210/bin:${LD_LIBRARY_PATH}
+```
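+
+Once the binaries are in place, a quick sanity check (assuming the installation went well) is to create one of the D4RL environments and load its dataset:
+```python
+import gym
+import d4rl  # noqa: F401, importing d4rl registers the offline environments in gym
+
+env = gym.make("halfcheetah-medium-v2")
+dataset = env.get_dataset()  # the dataset is downloaded on first use
+print(dataset["observations"].shape, dataset["actions"].shape)
+```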
+If you have any problems with the installation, we advise you to first look for similar issues in the
+original [**D4RL**](https://github.com/Farama-Foundation/D4RL) and [**mujoco-py**](https://github.com/openai/mujoco-py) repositories.
+Most likely the problem is in **D4RL**, not in **CORL** :smile:
+
+## Docker
+
+To simplify installation and improve reproducibility, we provide a preconfigured
+[Dockerfile](https://github.com/corl-team/CORL/blob/main/Dockerfile) that you can use:
+```bash
+cd CORL
+docker build -t corl .
+docker run --gpus=all -it --rm --name corl-container corl
+```
\ No newline at end of file
diff --git a/docs/get-started/usage.md b/docs/get-started/usage.md
new file mode 100644
index 00000000..e65b71c0
--- /dev/null
+++ b/docs/get-started/usage.md
@@ -0,0 +1,157 @@
+# Basic Usage
+
+![corl_tldr](../assets/corl.pdf)
+
+## How to Train
+
+We use [pyrallis](https://github.com/eladrich/pyrallis) for configuration, so after the dependencies have been installed,
+there are two ways to run the CORL algorithms:
+
+1. Manually specifying all the arguments within the terminal (they will overwrite the default ones):
+```commandline
+python algorithms/offline/dt.py \
+ --project="CORL-Test" \
+ --group="DT-Test" \
+ --name="dt-testing-run" \
+ --env_name="halfcheetah-medium-v2" \
+ --device="cuda:0"
+ # etc...
+```
+
+2. With a yaml config. First, create a yaml file with all the needed hyperparameters:
+```yaml title="dt_example_config.yaml"
+# taken from https://github.com/corl-team/CORL/blob/main/configs/offline/dt/halfcheetah/medium_v2.yaml
+attention_dropout: 0.1
+batch_size: 4096
+betas:
+- 0.9
+- 0.999
+checkpoints_path: null
+clip_grad: 0.25
+deterministic_torch: false
+device: cuda
+embedding_dim: 128
+embedding_dropout: 0.1
+env_name: "halfcheetah-medium-v2"
+episode_len: 1000
+eval_episodes: 100
+eval_every: 5000
+eval_seed: 42
+group: "dt-halfcheetah-medium-v2-multiseed-v2"
+learning_rate: 0.0008
+max_action: 1.0
+name: "DT"
+num_heads: 1
+num_layers: 3
+num_workers: 4
+project: "CORL"
+residual_dropout: 0.1
+reward_scale: 0.001
+seq_len: 20
+target_returns: [12000.0, 6000.0]
+train_seed: 10
+update_steps: 100000
+warmup_steps: 10000
+weight_decay: 0.0001
+```
+After that, we can supply all hyperparameters from the config with the `config_path` argument:
+```commandline
+python algorithms/offline/dt.py \
+    --config_path="dt_example_config.yaml" \
+    --device="cuda:0"
+    # you can also overwrite any other hyperparameter if needed
+    # etc...
+```
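+
+Under the hood, pyrallis simply builds the training config dataclass from the yaml file and/or the CLI overrides. A minimal sketch of this pattern (not the exact CORL code; the fields are illustrative) looks like this:
+```python
+from dataclasses import dataclass
+
+import pyrallis
+
+
+@dataclass
+class TrainConfig:
+    # a few illustrative fields; real configs have many more (see the help output below)
+    project: str = "CORL"
+    group: str = "DT-D4RL"
+    name: str = "DT"
+    env_name: str = "halfcheetah-medium-v2"
+    device: str = "cuda"
+
+
+@pyrallis.wrap()  # parses --config_path (if given) and any CLI overrides into TrainConfig
+def train(config: TrainConfig):
+    print(config)
+
+
+if __name__ == "__main__":
+    train()
+```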
+By default, the training script will log metrics to the wandb project and group specified by the `project` and `group` arguments.
+If you want to disable logging, run `wandb disabled` or `wandb offline`. To turn it back on, run `wandb online`.
+For more options, see the [wandb documentation](https://docs.wandb.ai/guides/technical-faq/general#can-i-disable-wandb-when-testing-my-code).
+
+If you're not familiar with [Weights & Biases](https://wandb.ai/site) logging tools, it is better to first familiarize
+yourself with the basics [here](https://docs.wandb.ai/quickstart).
+
+For an explanation of all logged metrics, refer to the documentation of the specific algorithm.
+
+## CLI Documentation
+
+To see all available hyperparameters and their brief explanations, simply run `python algorithms/offline/dt.py --help` (this works for all algorithms):
+```commandline
+usage: dt.py [-h] [--config_path str] [--project str] [--group str] [--name str] [--embedding_dim int] [--num_layers int]
+ [--num_heads int] [--seq_len int] [--episode_len int] [--attention_dropout float] [--residual_dropout float]
+ [--embedding_dropout float] [--max_action float] [--env_name str] [--learning_rate float]
+ [--betas float float] [--weight_decay float] [--clip_grad [float]] [--batch_size int] [--update_steps int]
+ [--warmup_steps int] [--reward_scale float] [--num_workers int] [--target_returns float [float, ...]]
+ [--eval_episodes int] [--eval_every int] [--checkpoints_path [str]] [--deterministic_torch bool]
+ [--train_seed int] [--eval_seed int] [--device str]
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config_path str Path for a config file to parse with pyrallis (default: None)
+
+TrainConfig:
+
+ --project str wandb project name (default: CORL)
+ --group str wandb group name (default: DT-D4RL)
+ --name str wandb run name (default: DT)
+ --embedding_dim int transformer hidden dim (default: 128)
+ --num_layers int depth of the transformer model (default: 3)
+ --num_heads int number of heads in the attention (default: 1)
+ --seq_len int maximum sequence length during training (default: 20)
+ --episode_len int maximum rollout length, needed for the positional embeddings (default: 1000)
+ --attention_dropout float
+ attention dropout (default: 0.1)
+ --residual_dropout float
+ residual dropout (default: 0.1)
+ --embedding_dropout float
+ embeddings dropout (default: 0.1)
+ --max_action float maximum range for the symmetric actions, [-1, 1] (default: 1.0)
+ --env_name str training dataset and evaluation environment (default: halfcheetah-medium-v2)
+ --learning_rate float
+ AdamW optimizer learning rate (default: 0.0001)
+ --betas float float AdamW optimizer betas (default: (0.9, 0.999))
+ --weight_decay float AdamW weight decay (default: 0.0001)
+ --clip_grad [float] maximum gradient norm during training, optional (default: 0.25)
+ --batch_size int training batch size (default: 64)
+ --update_steps int total training steps (default: 100000)
+ --warmup_steps int warmup steps for the learning rate scheduler (increasing from zero to learning_rate) (default:
+ 10000)
+ --reward_scale float reward scaling, to reduce the magnitude (default: 0.001)
+ --num_workers int number of workers for the pytorch dataloader (default: 4)
+ --target_returns float [float, ...]
+ target return-to-go for the prompting during evaluation (default: (12000.0, 6000.0))
+ --eval_episodes int number of episodes to run during evaluation (default: 100)
+ --eval_every int evaluation frequency, will evaluate eval_every training steps (default: 10000)
+ --checkpoints_path [str]
+ path for checkpoints saving, optional (default: None)
+ --deterministic_torch bool
+ configure PyTorch to use deterministic algorithms instead of nondeterministic ones where available
+ (default: False)
+ --train_seed int training random seed (default: 10)
+ --eval_seed int evaluation random seed (default: 42)
+ --device str training device (default: cuda)
+```
+
+## Benchmarking
+
+Sooner or later you will probably want to run many experiments at once, for example to search for hyperparameters,
+or to do multi-seed training for some datasets. For something like this we recommend using wandb sweeps (and we use them ourselves).
+The general recipe looks like this. First, create a wandb sweep config:
+```yaml title="sweep_config.yaml"
+entity: corl-team
+project: CORL
+program: algorithms/offline/dt.py
+method: grid
+parameters:
+ # specify all configs to run for the chosen algorithm
+ config_path:
+ values: [
+ "configs/offline/dt/halfcheetah/medium_v2.yaml",
+ "configs/offline/dt/halfcheetah/medium_replay_v2.yaml",
+ "configs/offline/dt/halfcheetah/medium_expert_v2.yaml",
+ ]
+ train_seed:
+ values: [0, 1, 2, 3]
+```
+Then proceed as usual: create a wandb sweep with `wandb sweep sweep_config.yaml`, then run agents with `wandb agent `.
+This will train multiple seeds for each config.
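+
+If you prefer not to use wandb sweeps, a minimal local alternative (sequential, assuming the same configs and seeds as in the sweep above) could look like this sketch:
+```python
+import itertools
+import subprocess
+
+configs = [
+    "configs/offline/dt/halfcheetah/medium_v2.yaml",
+    "configs/offline/dt/halfcheetah/medium_replay_v2.yaml",
+    "configs/offline/dt/halfcheetah/medium_expert_v2.yaml",
+]
+seeds = [0, 1, 2, 3]
+
+# run every (config, seed) pair one after another
+for config_path, seed in itertools.product(configs, seeds):
+    subprocess.run(
+        [
+            "python", "algorithms/offline/dt.py",
+            f"--config_path={config_path}",
+            f"--train_seed={seed}",
+        ],
+        check=True,
+    )
+```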
+
+All configs with full hyperparameters for all datasets and algorithms are in [`configs`](https://github.com/corl-team/CORL/tree/main/configs).
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..cb337970
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,77 @@
+---
+hide:
+ - toc # Hide table of contents
+---
+
+# CORL (Clean Offline Reinforcement Learning)
+
+[![Twitter](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://twitter.com/vladkurenkov/status/1669361090550177793)
+[![arXiv](https://img.shields.io/badge/arXiv-2210.07105-b31b1b.svg)](https://arxiv.org/abs/2210.07105)
+[](https://github.com/tinkoff-ai/CORL/blob/main/LICENSE)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+
+🧵 CORL is an Offline Reinforcement Learning library that provides high-quality and easy-to-follow single-file implementations
+of SOTA **offline reinforcement learning** algorithms. Each implementation is backed by a research-friendly codebase, allowing
+you to run or tune thousands of experiments. Heavily inspired by [cleanrl](https://github.com/vwxyzjn/cleanrl) for online RL,
+check them out too! The highlight features of CORL are:
+
+* 📜 Single-file implementation
+* 📈 Benchmarked Implementation (11+ offline algorithms, 5+ offline-to-online algorithms, 30+ datasets with detailed logs :material-arm-flex:)
+* 🖼 [Weights and Biases](https://wandb.ai/site) integration
+
+You can read more about CORL design and main results in our [technical paper](https://arxiv.org/abs/2210.07105).
+
+
+!!! tip
+ ⭐ If you're interested in __discrete control__, make sure to check out our new library — [Katakomba](https://github.com/corl-team/katakomba). It provides both discrete control algorithms augmented with recurrence and an offline RL benchmark for the NetHack Learning environment.
+
+
+!!! info
+    **Minari** and **Gymnasium** support: [Farama-Foundation/Minari](https://github.com/Farama-Foundation/Minari) is the
+    next generation of D4RL that will continue to be maintained and introduce new features and datasets.
+    Please see their [announcement](https://farama.org/Announcing-Minari) for further details.
+    We are currently slowly migrating to Minari, and the progress
+    can be tracked [here](https://github.com/corl-team/CORL/issues/2). This will allow us to significantly update dependencies
+    and simplify installation, and give users access to many new datasets out of the box!
+
+
+!!! warning
+    CORL (similarly to CleanRL) is not a modular library and therefore it is not meant to be imported.
+    At the cost of duplicate code, we make all implementation details of an ORL algorithm variant easy
+    to understand. You should consider using CORL if you want to 1) understand and control all implementation details
+    of an algorithm or 2) rapidly prototype advanced features that other modular ORL libraries do not support.
+
+
+## Algorithms Implemented
+
+| Algorithm | Variants Implemented | Wandb Report |
+|--------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- |
+| **Offline and Offline-to-Online** | |
+| ✅ [Conservative Q-Learning for Offline Reinforcement Learning <br>(CQL)](https://arxiv.org/abs/2006.04779) | :material-github: [`offline/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/cql.py) <br> :material-github: [`finetune/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cql.py) <br> :material-file-document: [docs](algorithms/cql.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-CQL--VmlldzoyNzA2MTk5) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-CQL--Vmlldzo0NTQ3NTMz)
+| ✅ [Accelerating Online Reinforcement Learning with Offline Datasets <br>(AWAC)](https://arxiv.org/abs/2006.09359) | :material-github: [`offline/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/awac.py) <br> :material-github: [`finetune/awac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/awac.py) <br> :material-file-document: [docs](algorithms/awac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-AWAC--VmlldzoyNzA2MjE3) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-AWAC--VmlldzozODAyNzQz)
+| ✅ [Offline Reinforcement Learning with Implicit Q-Learning <br>(IQL)](https://arxiv.org/abs/2110.06169) | :material-github: [`offline/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/iql.py) <br> :material-github: [`finetune/iql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/iql.py) <br> :material-file-document: [docs](algorithms/iql.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-IQL--VmlldzoyNzA2MTkx) <br> :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-IQL--VmlldzozNzE1MTEy)
+| **Offline-to-Online only** | |
+| ✅ [Supported Policy Optimization for Offline Reinforcement Learning <br>(SPOT)](https://arxiv.org/abs/2202.06239) | :material-github: [`finetune/spot.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/spot.py) <br> :material-file-document: [docs](algorithms/spot.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-SPOT--VmlldzozODk5MTgx)
+| ✅ [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning <br>(Cal-QL)](https://arxiv.org/abs/2303.05479) | :material-github: [`finetune/cal_ql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cal_ql.py) <br> :material-file-document: [docs](algorithms/cal-ql.md) | :material-chart-box: [`Offline-to-online`](https://wandb.ai/tlab/CORL/reports/-Offline-to-Online-Cal-QL--Vmlldzo0NTQ3NDk5)
+| **Offline only** | |
+| ✅ Behavioral Cloning <br>(BC) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py) <br> :material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC--VmlldzoyNzA2MjE1)
+| ✅ Behavioral Cloning-10% <br>(BC-10%) | :material-github: [`offline/any_percent_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/any_percent_bc.py) <br> :material-file-document: [docs](algorithms/bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-BC-10---VmlldzoyNzEwMjcx)
+| ✅ [A Minimalist Approach to Offline Reinforcement Learning <br>(TD3+BC)](https://arxiv.org/abs/2106.06860) | :material-github: [`offline/td3_bc.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/td3_bc.py) <br> :material-file-document: [docs](algorithms/td3-bc.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-TD3-BC--VmlldzoyNzA2MjA0)
+| ✅ [Decision Transformer: Reinforcement Learning via Sequence Modeling <br>(DT)](https://arxiv.org/abs/2106.01345) | :material-github: [`offline/dt.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/dt.py) <br> :material-file-document: [docs](algorithms/dt.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-Decision-Transformer--VmlldzoyNzA2MTk3)
+| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble <br>(SAC-N)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/sac_n.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/sac_n.py) <br> :material-file-document: [docs](algorithms/sac-n.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-SAC-N--VmlldzoyNzA1NTY1)
+| ✅ [Uncertainty-Based Offline Reinforcement Learning with Diversified Q-Ensemble <br>(EDAC)](https://arxiv.org/abs/2110.01548) | :material-github: [`offline/edac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/edac.py) <br> :material-file-document: [docs](algorithms/edac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-EDAC--VmlldzoyNzA5ODUw)
+| ✅ [Revisiting the Minimalist Approach to Offline Reinforcement Learning <br>(ReBRAC)](https://arxiv.org/abs/2305.09836) | :material-github: [`offline/rebrac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/rebrac.py) <br> :material-file-document: [docs](algorithms/rebrac.md) | :material-chart-box: [`Offline`](https://wandb.ai/tlab/CORL/reports/-Offline-ReBRAC--Vmlldzo0ODkzOTQ2)
+| ✅ [Q-Ensemble for Offline RL: Don't Scale the Ensemble, Scale the Batch Size <br>(LB-SAC)](https://arxiv.org/abs/2211.11092) | :material-github: [`offline/lb_sac.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/lb_sac.py) <br> :material-file-document: [docs](algorithms/lb-sac.md) | :material-chart-box: [`Offline Gym-MuJoCo`](https://wandb.ai/tlab/CORL/reports/LB-SAC-D4RL-Results--VmlldzozNjIxMDY1)
+
+## Citing CORL
+If you use CORL in your work, please use the following bibtex
+```bibtex
+@inproceedings{
+tarasov2022corl,
+ title={CORL: Research-oriented Deep Offline Reinforcement Learning Library},
+ author={Denis Tarasov and Alexander Nikulin and Dmitry Akimov and Vladislav Kurenkov and Sergey Kolesnikov},
+ booktitle={3rd Offline RL Workshop: Offline RL as a ''Launchpad''},
+ year={2022},
+ url={https://openreview.net/forum?id=SyAS49bBcv}
+}
+```
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..bda5b719
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,85 @@
+site_name: Clean Offline RL
+theme:
+ name: material
+ logo: assets/logo.jpeg
+ palette:
+ # Palette toggle for light mode
+ - scheme: default
+ toggle:
+ icon: material/toggle-switch
+ name: Switch to dark mode
+
+ # Palette toggle for dark mode
+ - scheme: slate
+ toggle:
+ icon: material/toggle-switch-off-outline
+ name: Switch to light mode
+ features:
+ - navigation.instant
+ - navigation.tracking
+ - navigation.sections
+ - navigation.expand
+ - navigation.path
+# - toc.integrate
+ - navigation.top
+ - search.suggest
+ - search.highlight
+ - header.autohide
+ - content.code.copy
+ - content.code.annotate
+
+copyright: Copyright © 2022, CORL Team
+
+repo_url: https://github.com/corl-team/CORL
+repo_name: corl-team/CORL
+
+markdown_extensions:
+ - meta
+ - admonition
+ - pymdownx.details
+ - pymdownx.superfences
+ - pymdownx.emoji:
+ emoji_index: !!python/name:materialx.emoji.twemoji
+ emoji_generator: !!python/name:materialx.emoji.to_svg
+ - pymdownx.highlight:
+ anchor_linenums: true
+ line_spans: __span
+ pygments_lang_class: true
+ - pymdownx.inlinehilite
+ - pymdownx.snippets
+ - pymdownx.tasklist:
+ custom_checkbox: true
+ clickable_checkbox: false
+ - pymdownx.arithmatex:
+ generic: true
+
+extra_javascript:
+ - javascripts/mathjax.js
+ - https://polyfill.io/v3/polyfill.min.js?features=es6
+ - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
+
+nav:
+ - Overview: index.md
+ - Get Started:
+ - get-started/install.md
+ - get-started/usage.md
+ - Benchmarks:
+ - benchmarks/offline.md
+ - benchmarks/offline-to-online.md
+ - benchmarks/repro.md
+ - Algorithms:
+ - algorithms/bc.md
+ - algorithms/td3-bc.md
+ - algorithms/dt.md
+ - algorithms/sac-n.md
+ - algorithms/edac.md
+ - algorithms/rebrac.md
+ - algorithms/lb-sac.md
+ - algorithms/cql.md
+ - algorithms/awac.md
+ - algorithms/iql.md
+ - algorithms/cal-ql.md
+ - algorithms/spot.md
+ - Community:
+ - community/contrib.md
+ - community/publications.md
\ No newline at end of file
diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt
index e0e489b0..16d0cc05 100644
--- a/requirements/requirements_dev.txt
+++ b/requirements/requirements_dev.txt
@@ -10,6 +10,7 @@ torch==1.11.0+cu113
pyrallis==0.3.1
pre-commit==3.3.3
ruff==0.0.278
+mkdocs-material==9.1.21
--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
jax==0.4.1
jaxlib[cuda11_cudnn82]==0.4.1
diff --git a/results/get_offline_scores.py b/results/get_offline_scores.py
index 0e165a97..fe5e8570 100644
--- a/results/get_offline_scores.py
+++ b/results/get_offline_scores.py
@@ -44,7 +44,11 @@ def process_runs(df):
df.iterrows(), desc="Runs scores downloading", position=0, leave=True
):
full_scores[row["algorithm"]][row["dataset"]].append(
- get_run_scores(row["url"], row["algorithm"] == "DT", row["algorithm"] == "AWAC")
+ get_run_scores(
+ row["url"],
+ row["algorithm"] == "DT",
+ row["algorithm"] == "AWAC"
+ )
)
return full_scores