diff --git a/algorithms/finetune/cal_ql.py b/algorithms/finetune/cal_ql.py index d402e1b9..4326489b 100644 --- a/algorithms/finetune/cal_ql.py +++ b/algorithms/finetune/cal_ql.py @@ -68,9 +68,9 @@ class TrainConfig: mixing_ratio: float = 0.5 # Data mixing ratio for online tuning is_sparse_reward: bool = False # Use sparse reward # Wandb logging - project: str = "CORL" - group: str = "Cal-QL-D4RL" - name: str = "Cal-QL" + project: str = "CORL" # wandb project name + group: str = "Cal-QL-D4RL" # wandb group name + name: str = "Cal-QL" # wandb run name def __post_init__(self): self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" diff --git a/algorithms/offline/cql.py b/algorithms/offline/cql.py index 307ffc08..962d1a15 100644 --- a/algorithms/offline/cql.py +++ b/algorithms/offline/cql.py @@ -56,15 +56,16 @@ class TrainConfig: q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks reward_scale: float = 1.0 # Reward scale for normalization reward_bias: float = 0.0 # Reward bias for normalization - bc_steps: int = int(0) # Number of BC steps at start (AntMaze hacks) - reward_scale: float = 5.0 - reward_bias: float = -1.0 - policy_log_std_multiplier: float = 1.0 + + bc_steps: int = int(0) # Number of BC steps at start + reward_scale: float = 5.0 # Reward scale for normalization + reward_bias: float = -1.0 # Reward bias for normalization + policy_log_std_multiplier: float = 1.0 # Stochastic policy std multiplier # Wandb logging - project: str = "CORL" - group: str = "CQL-D4RL" - name: str = "CQL" + project: str = "CORL" # wandb project name + group: str = "CQL-D4RL" # wandb group name + name: str = "CQL" # wandb run name def __post_init__(self): self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}" diff --git a/docs/algorithms/cal-ql.md b/docs/algorithms/cal-ql.md index 798de1ef..f0cafbe9 100644 --- a/docs/algorithms/cal-ql.md +++ b/docs/algorithms/cal-ql.md @@ -1 +1,137 @@ # Cal-QL + +## Overview + +The Calibrated Q-Learning (Cal-QL) is a modification for offline Actor Critic algorithms which aims to improve their offline-to-online transfer. +Offline RL algorithms which try to minimize the out-of-distribution values for Q function may lower this value to much which lead to unlearning at early finetuning steps. +Originally it was proposed for CQL and our implementation also builds upon it. + +In order to resolve the problem with unrealistically low Q values the following change to the critic loss function is done (change in blue) + +$$ +\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{j=1, 2}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right] + {\mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \mathcal{\mu(a | s)}}\left[{\color{blue}\max(Q_{\phi_i^{\prime}}(s, a), V^\nu(s))\right] - \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \mathcal{\hat{\pi}_\beta(a | s)}}\left[Q_{\phi_i^{\prime}}(s, a)\right]} +$$ +where $V^\nu(s)$ is value function approximation. Simple return-to-go calculated from the dataset is used for this purpose. + +Original paper: + +* [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning](https://arxiv.org/abs/2303.05479) + +Reference resources: + +* :material-github: [Official codebase for Cal-QL](https://github.com/nakamotoo/Cal-QL) + + +!!! warning + Cal-QL is originally based on CQL and inherits all the weaknesses which CQL has (e.g. slow training or hyperparameters sensitivity). + +!!! warning + Cal-QL performs worse in offline setup than some of the other algorithms but finetunes much better. + +!!! success + Cal-QL is the state-of-the-art solution for challenging AntMaze domain in offline-to-online domain. + + +## Implemented Variants + +| Variants Implemented | Description | +|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------| +| :material-github: [`finetune/cal_ql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cal_ql.py)
:material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/finetune/cal_ql) | For continuous action spaces and offline-to-online RL. | + + +## Explanation of some of logged metrics + +* `policy_loss`: mean actor loss. +* `alpha_loss`: mean SAC entropy loss. +* `qf{i}_loss`: mean i-th critic loss. +* `cql_q{i}_{next_actions, rand}`: Q mean values of i-th critic for next or random actions. +* `d4rl_normalized_score`: mean evaluation normalized score. Should be between 0 and 100, where 100+ is the + performance above expert for this environment. Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)]. + +## Implementation details (for more see CQL) + +1. Return-to-go calculation (:material-github: [algorithms/finetune/cal_ql.py#L275](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/finetune/cal_ql.py#L275)) +2. Offline and online data constant proportion (:material-github: [algorithms/finetune/cal_ql.py#L1187](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/finetune/cal_ql.py#LL1187)) + +## Experimental results + +For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md). +Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly. + + + +## Training options + +```commandline +usage: cal_ql.py [-h] [--config_path str] [--device str] [--env str] [--seed str] [--eval_seed str] [--eval_freq str] [--n_episodes str] [--offline_iterations str] [--online_iterations str] [--checkpoints_path str] + [--load_model str] [--buffer_size str] [--batch_size str] [--discount str] [--alpha_multiplier str] [--use_automatic_entropy_tuning str] [--backup_entropy str] [--policy_lr str] [--qf_lr str] + [--soft_target_update_rate str] [--bc_steps str] [--target_update_period str] [--cql_alpha str] [--cql_alpha_online str] [--cql_n_actions str] [--cql_importance_sample str] [--cql_lagrange str] + [--cql_target_action_gap str] [--cql_temp str] [--cql_max_target_backup str] [--cql_clip_diff_min str] [--cql_clip_diff_max str] [--orthogonal_init str] [--normalize str] [--normalize_reward str] + [--q_n_hidden_layers str] [--reward_scale str] [--reward_bias str] [--mixing_ratio str] [--is_sparse_reward str] [--project str] [--group str] [--name str] + +options: + -h, --help show this help message and exit + --config_path str Path for a config file to parse with pyrallis + +TrainConfig: + + --device str Experiment + --env str OpenAI gym environment name + --seed str Sets Gym, PyTorch and Numpy seeds + --eval_seed str Eval environment seed + --eval_freq str How often (time steps) we evaluate + --n_episodes str How many episodes run during evaluation + --offline_iterations str + Number of offline updates + --online_iterations str + Number of online updates + --checkpoints_path str + Save path + --load_model str Model load file name, "" doesn't load + --buffer_size str CQL + --batch_size str Batch size for all networks + --discount str Discount factor + --alpha_multiplier str + Multiplier for alpha in loss + --use_automatic_entropy_tuning str + Tune entropy + --backup_entropy str Use backup entropy + --policy_lr str Policy learning rate + --qf_lr str Critics learning rate + --soft_target_update_rate str + Target network update rate + --bc_steps str Number of BC steps at start + --target_update_period str + Frequency of target nets updates + --cql_alpha str CQL offline regularization parameter + --cql_alpha_online str + CQL online regularization parameter + --cql_n_actions str Number of sampled actions + --cql_importance_sample str + Use importance sampling + --cql_lagrange str Use Lagrange version of CQL + --cql_target_action_gap str + Action gap + --cql_temp str CQL temperature + --cql_max_target_backup str + Use max target backup + --cql_clip_diff_min str + Q-function lower loss clipping + --cql_clip_diff_max str + Q-function upper loss clipping + --orthogonal_init str + Orthogonal initialization + --normalize str Normalize states + --normalize_reward str + Normalize reward + --q_n_hidden_layers str + Number of hidden layers in Q networks + --reward_scale str Reward scale for normalization + --reward_bias str Reward bias for normalization + --mixing_ratio str Cal-QL + --is_sparse_reward str + Use sparse reward + --project str Wandb logging + --group str wandb group name + --name str wandb run name +``` diff --git a/docs/algorithms/cql.md b/docs/algorithms/cql.md index 194cc7a1..db348981 100644 --- a/docs/algorithms/cql.md +++ b/docs/algorithms/cql.md @@ -1 +1,151 @@ -# CQL \ No newline at end of file +# CQL + +## Overview + +The Conservative Q-Learning (CQL) is one of the most popular offline RL frameworks. +It is originally build upon the Soft Actor Critic (SAC) but can be transferred to any other method which employs Q function. +The core idea behind CQL is to approximate Q values for state-action pairs within dataset and to minimize this value for out-of-distribution pairs. + +This idea can be achieved with the following critic loss (change in blue) +$$ +\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{j=1, 2}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right] {\color{blue}{+ \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \mathcal{\mu(a | s)}}\left[Q_{\phi_i^{\prime}}(s, a)\right]} +$$ +where $\mathcal{\mu(a | s)}$ is sampling from the current policy with randomness. + +Authors also propose maximizng values withing dataset for better approximation which should lead to the lower bound of the true values. + +The final critic loss is the following (change in blue) +$$ +\min _{\phi_i} \mathbb{E}_{\mathbf{s}, \mathbf{a}, \mathbf{s}^{\prime} \sim \mathcal{D}}\left[\left(Q_{\phi_i}(\mathbf{s}, \mathbf{a})-\left(r(\mathbf{s}, \mathbf{a})+\gamma \mathbb{E}_{\mathbf{a}^{\prime} \sim \pi_\theta\left(\cdot \mid \mathbf{s}^{\prime}\right)}\left[\min _{j=1, 2}} Q_{\phi_j^{\prime}}\left(\mathbf{s}^{\prime}, \mathbf{a}^{\prime}\right)-\alpha \log \pi_\theta\left(\mathbf{a}^{\prime} \mid \mathbf{s}^{\prime}\right)\right]\right)\right)^2\right] + {\color{blue}{\mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \mathcal{\mu(a | s)}}\left[Q_{\phi_i^{\prime}}(s, a)\right] - \mathbb{E}_{\mathbf{s} \sim \mathcal{D}, \mathbf{a} \sim \mathcal{\hat{\pi}_\beta(a | s)}}\left[Q_{\phi_i^{\prime}}(s, a)\right]} +$$ + + +There are more details and a number of CQL variants. To find out more about them, we redirect to the original work + +Original paper: + + * [Conservative Q-Learning for Offline Reinforcement Learning](https://arxiv.org/abs/2006.04779) + +Reference resources: + +* :material-github: [Official codebase for CQL (does not reproduce results from the paper)](https://github.com/aviralkumar2907/CQL) +* :material-github: [Working unofficial implementation for CQL (Pytorch)](https://github.com/young-geng/CQL) +* :material-github: [Working unofficial implementation for CQL (JAX)](https://github.com/young-geng/JaxCQL) + + +!!! warning + CQL has many hyperparameters and it is very sensitive to them. For example, our implementation wasn't able to achieve reasonable results without increasing the number of critic hidden layers. + +!!! warning + Due to the need in actions sampling CQL training runtime is slow comparing to other approaches. Usually it is about x4 time comparing of the backbone AC algorithm. + +!!! success + CQL is simple and fast in case of discrete actions space. + +Possible extensions: + +* [Cal-QL: Calibrated Offline RL Pre-Training for Efficient Online Fine-Tuning](https://arxiv.org/abs/2303.05479) + + +## Implemented Variants + +| Variants Implemented | Description | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------| +| :material-github: [`offline/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/offline/cql.py)
:material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/offline/cql) | For continuous action spaces and offline RL. | +| :material-github: [`finetune/cql.py`](https://github.com/corl-team/CORL/blob/main/algorithms/finetune/cql.py)
:material-database: [configs](https://github.com/corl-team/CORL/tree/main/configs/finetune/cql) | For continuous action spaces and offline-to-online RL. | + + +## Explanation of some of logged metrics + +* `policy_loss`: mean actor loss. +* `alpha_loss`: mean SAC entropy loss. +* `qf{i}_loss`: mean i-th critic loss. +* `cql_q{i}_{next_actions, rand}`: Q mean values of i-th critic for next or random actions. +* `d4rl_normalized_score`: mean evaluation normalized score. Should be between 0 and 100, where 100+ is the + performance above expert for this environment. Implemented by D4RL library [[:material-github: source](https://github.com/Farama-Foundation/D4RL/blob/71a9549f2091accff93eeff68f1f3ab2c0e0a288/d4rl/offline_env.py#L71)]. + +## Implementation details + +1. Reward scaling (:material-github: [algorithms/offline/cql.py#L238](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/cql.py#L238)) +2. Increased critic size (:material-github: [algorithms/offline/cql.py#L392](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/cql.py#L392)) +3. Max target backup (:material-github: [algorithms/offline/cql.py#L568](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/cql.py#L568)) +4. Importance sample (:material-github: [algorithms/offline/cql.py#L647](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/cql.py#L647)) +5. CQL lagrange variant (:material-github: [algorithms/offline/cql.py#L681](https://github.com/corl-team/CORL/blob/e9768f90a95c809a5587dd888e203d0b76b07a39/algorithms/offline/cql.py#L681)) + +## Experimental results + +For detailed scores on all benchmarked datasets see [benchmarks section](../benchmarks/offline.md). +Reports visually compare our reproduction results with original paper scores to make sure our implementation is working properly. + + + + + +## Training options + +```commandline +usage: cql.py [-h] [--config_path str] [--device str] [--env str] [--seed str] [--eval_freq str] [--n_episodes str] [--max_timesteps str] [--checkpoints_path str] [--load_model str] [--buffer_size str] [--batch_size str] + [--discount str] [--alpha_multiplier str] [--use_automatic_entropy_tuning str] [--backup_entropy str] [--policy_lr str] [--qf_lr str] [--soft_target_update_rate str] [--target_update_period str] + [--cql_n_actions str] [--cql_importance_sample str] [--cql_lagrange str] [--cql_target_action_gap str] [--cql_temp str] [--cql_alpha str] [--cql_max_target_backup str] [--cql_clip_diff_min str] + [--cql_clip_diff_max str] [--orthogonal_init str] [--normalize str] [--normalize_reward str] [--q_n_hidden_layers str] [--reward_scale str] [--reward_bias str] [--bc_steps str] + [--policy_log_std_multiplier str] [--project str] [--group str] [--name str] + +options: + -h, --help show this help message and exit + --config_path str Path for a config file to parse with pyrallis + +TrainConfig: + + --device str Experiment + --env str OpenAI gym environment name + --seed str Sets Gym, PyTorch and Numpy seeds + --eval_freq str How often (time steps) we evaluate + --n_episodes str How many episodes run during evaluation + --max_timesteps str Max time steps to run environment + --checkpoints_path str + Save path + --load_model str Model load file name, "" doesn't load + --buffer_size str CQL + --batch_size str Batch size for all networks + --discount str Discount factor + --alpha_multiplier str + Multiplier for alpha in loss + --use_automatic_entropy_tuning str + Tune entropy + --backup_entropy str Use backup entropy + --policy_lr str Policy learning rate + --qf_lr str Critics learning rate + --soft_target_update_rate str + Target network update rate + --target_update_period str + Frequency of target nets updates + --cql_n_actions str Number of sampled actions + --cql_importance_sample str + Use importance sampling + --cql_lagrange str Use Lagrange version of CQL + --cql_target_action_gap str + Action gap + --cql_temp str CQL temperature + --cql_alpha str Minimal Q weight + --cql_max_target_backup str + Use max target backup + --cql_clip_diff_min str + Q-function lower loss clipping + --cql_clip_diff_max str + Q-function upper loss clipping + --orthogonal_init str + Orthogonal initialization + --normalize str Normalize states + --normalize_reward str + Normalize reward + --q_n_hidden_layers str + Number of hidden layers in Q networks + --reward_scale str Reward scale for normalization + --reward_bias str Reward bias for normalization + --bc_steps str AntMaze hacks + --policy_log_std_multiplier str + Stochastic policy std multiplier + --project str Wandb logging + --group str wandb group name + --name str wandb run name +```