Merge branch 'main' into rebrac-finetune
DT6A authored Dec 6, 2023
2 parents d55af0e + 14abd6e commit 2ed69e3
Showing 43 changed files with 1,810 additions and 216 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,25 @@
name: ci
on:
  push:
    branches:
      - main
      - howuhh/docs-wip
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v3
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install mkdocs-material
      - run: mkdocs gh-deploy --force
2 changes: 1 addition & 1 deletion .gitignore
@@ -145,4 +145,4 @@ dmypy.json
.json
.yaml
wandb
assets/
#assets/
178 changes: 94 additions & 84 deletions README.md

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions algorithms/finetune/cal_ql.py
@@ -68,9 +68,9 @@ class TrainConfig:
mixing_ratio: float = 0.5 # Data mixing ratio for online tuning
is_sparse_reward: bool = False # Use sparse reward
# Wandb logging
project: str = "CORL"
group: str = "Cal-QL-D4RL"
name: str = "Cal-QL"
project: str = "CORL" # wandb project name
group: str = "Cal-QL-D4RL" # wandb group name
name: str = "Cal-QL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
@@ -670,7 +670,7 @@ def _q_loss(
mc_returns: torch.Tensor,
alpha: torch.Tensor,
log_dict: Dict,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> torch.Tensor:
q1_predicted = self.critic_1(observations, actions)
q2_predicted = self.critic_2(observations, actions)

@@ -874,7 +874,7 @@ def _q_loss(
)
)

return qf_loss, alpha_prime, alpha_prime_loss
return qf_loss

def train(self, batch: TensorBatch) -> Dict[str, float]:
(
@@ -904,7 +904,7 @@ def train(self, batch: TensorBatch) -> Dict[str, float]:
)

""" Q function loss """
qf_loss, alpha_prime, alpha_prime_loss = self._q_loss(
qf_loss = self._q_loss(
observations,
actions,
next_observations,
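For context on the refactor above: `_q_loss` now returns only the scalar critic loss, and the training loop no longer unpacks `alpha_prime` and `alpha_prime_loss`, presumably because those auxiliary quantities are already recorded through the `log_dict` argument. A minimal sketch of that calling pattern (illustrative names only, not the Cal-QL loss itself):

from typing import Dict

import torch
import torch.nn.functional as F


def q_loss_sketch(
    q1_predicted: torch.Tensor,
    q2_predicted: torch.Tensor,
    td_target: torch.Tensor,
    log_dict: Dict[str, float],
) -> torch.Tensor:
    # both critics regress onto the same TD target
    qf_loss = F.mse_loss(q1_predicted, td_target) + F.mse_loss(q2_predicted, td_target)
    # anything the caller previously unpacked from the return value is logged here
    log_dict["qf_loss"] = qf_loss.item()
    return qf_loss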
9 changes: 3 additions & 6 deletions algorithms/finetune/cql.py
@@ -26,7 +26,6 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -37,7 +36,6 @@ class TrainConfig:
online_iterations: int = int(1e6) # Number of online updates
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
# CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
@@ -65,10 +63,9 @@ class TrainConfig:
q_n_hidden_layers: int = 2 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization
# Wandb logging
project: str = "CORL"
group: str = "CQL-D4RL"
name: str = "CQL"
project: str = "CORL" # wandb project name
group: str = "CQL-D4RL" # wandb group name
name: str = "CQL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
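The per-field comments being added to these TrainConfig dataclasses, here and in the other algorithm files below, document options that are normally overridden from the command line. A minimal sketch of how such a dataclass config is typically consumed, assuming a pyrallis-style entry point (the class and field names below are illustrative, not taken from the file):

from dataclasses import dataclass

import pyrallis


@dataclass
class ConfigSketch:
    project: str = "CORL"  # wandb project name
    seed: int = 0          # training random seed
    device: str = "cuda"   # training device


@pyrallis.wrap()
def train(config: ConfigSketch):
    # pyrallis builds a CLI from the dataclass fields, so defaults can be
    # overridden like: python script.py --seed 42 --device cpu
    print(config)


if __name__ == "__main__":
    train()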
48 changes: 31 additions & 17 deletions algorithms/offline/any_percent_bc.py
@@ -19,26 +19,40 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
eval_freq: int = int(5e3) # How often (time steps) we evaluate
n_episodes: int = 10 # How many episodes run during evaluation
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
# BC
buffer_size: int = 2_000_000 # Replay buffer size
frac: float = 0.1 # Best data fraction to use
max_traj_len: int = 1000 # Max trajectory length
normalize: bool = True # Normalize states
# Wandb logging
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "BC-D4RL"
# wandb run name
name: str = "BC"
# training dataset and evaluation environment
env: str = "halfcheetah-medium-expert-v2"
# total gradient updates during training
max_timesteps: int = int(1e6)
# training batch size
batch_size: int = 256
# maximum size of the replay buffer
buffer_size: int = 2_000_000
# what top fraction of the dataset (sorted by return) to use
frac: float = 0.1
# maximum possible trajectory length
max_traj_len: int = 1000
# whether to normalize states
normalize: bool = True
# discount factor
discount: float = 0.99
# evaluation frequency, will evaluate every eval_freq training steps
eval_freq: int = int(5e3)
# number of episodes to run during evaluation
n_episodes: int = 10
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# file name for loading a model, optional
load_model: str = ""
# training random seed
seed: int = 0
# training device
device: str = "cuda"

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
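The `frac` field above ("what top fraction of the dataset (sorted by return) to use") is what makes this file an any-% BC baseline: only the best-returning trajectories are kept for behaviour cloning. A rough sketch of that selection step, under the assumption that per-trajectory returns are already computed (the helper name is hypothetical, not from the file):

import numpy as np


def top_frac_indices_sketch(traj_returns: np.ndarray, frac: float = 0.1) -> np.ndarray:
    # keep indices of the best `frac` of trajectories, ranked by episodic return
    n_keep = max(1, int(len(traj_returns) * frac))
    order = np.argsort(traj_returns)[::-1]  # descending by return
    return order[:n_keep]


# usage: keep_ids = top_frac_indices_sketch(np.array([102.5, 7.1, 55.0]), frac=0.5)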
48 changes: 34 additions & 14 deletions algorithms/offline/awac.py
@@ -20,29 +20,49 @@

@dataclass
class TrainConfig:
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "AWAC-D4RL"
# wandb run name
name: str = "AWAC"
checkpoints_path: Optional[str] = None

# training dataset and evaluation environment
env_name: str = "halfcheetah-medium-expert-v2"
seed: int = 42
test_seed: int = 69
deterministic_torch: bool = False
device: str = "cuda"

buffer_size: int = 2_000_000
num_train_ops: int = 1_000_000
batch_size: int = 256
eval_frequency: int = 1000
n_test_episodes: int = 10
normalize_reward: bool = False

# actor and critic hidden dim
hidden_dim: int = 256
# actor and critic learning rate
learning_rate: float = 3e-4
# discount factor
gamma: float = 0.99
# coefficient for the target critic Polyak's update
tau: float = 5e-3
# awac actor loss temperature, controlling balance
# between behaviour cloning and Q-value maximization
awac_lambda: float = 1.0
# total number of gradient updates during training
num_train_ops: int = 1_000_000
# training batch size
batch_size: int = 256
# maximum size of the replay buffer
buffer_size: int = 2_000_000
# whether to normalize reward (like in IQL)
normalize_reward: bool = False
# evaluation frequency, will evaluate every eval_frequency
# training steps
eval_frequency: int = 1000
# number of episodes to run during evaluation
n_test_episodes: int = 10
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# configure PyTorch to use deterministic algorithms instead
# of nondeterministic ones
deterministic_torch: bool = False
# training random seed
seed: int = 42
# evaluation random seed
test_seed: int = 69
# training device
device: str = "cuda"

def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
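The `awac_lambda` comment above describes the actor objective: advantage-weighted behaviour cloning, where the log-likelihood of dataset actions is weighted by exp(A(s, a) / lambda). Smaller lambda leans harder on Q-value maximization, larger lambda behaves more like plain behaviour cloning. A rough sketch of the standard AWAC form (not copied from this file; names are illustrative):

import torch


def awac_actor_loss_sketch(
    log_prob: torch.Tensor,  # log pi(a | s) for dataset actions
    q_value: torch.Tensor,   # Q(s, a) from the critic
    v_value: torch.Tensor,   # value baseline, e.g. Q(s, pi(s))
    awac_lambda: float = 1.0,
) -> torch.Tensor:
    advantage = q_value - v_value
    # exponentiated-advantage weights, clipped for numerical stability
    weights = torch.exp(advantage / awac_lambda).clamp(max=100.0)
    # weighted behaviour cloning: maximize likelihood of high-advantage actions
    return -(weights.detach() * log_prob).mean()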
19 changes: 6 additions & 13 deletions algorithms/offline/cql.py
@@ -23,7 +23,6 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -32,8 +31,6 @@ class TrainConfig:
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load

# CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
Expand All @@ -59,17 +56,13 @@ class TrainConfig:
q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization

# AntMaze hacks
bc_steps: int = int(0) # Number of BC steps at start
reward_scale: float = 5.0
reward_bias: float = -1.0
policy_log_std_multiplier: float = 1.0

# Wandb logging
project: str = "CORL"
group: str = "CQL-D4RL"
name: str = "CQL"
reward_scale: float = 5.0 # Reward scale for normalization
reward_bias: float = -1.0 # Reward bias for normalization
policy_log_std_multiplier: float = 1.0 # Stochastic policy std multiplier
project: str = "CORL" # wandb project name
group: str = "CQL-D4RL" # wandb group name
name: str = "CQL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
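The `reward_scale` / `reward_bias` pair kept above configures a simple affine reward transformation, commonly applied to AntMaze tasks. A minimal sketch of what those two fields control (the helper name is assumed, and the exact shaping logic in the file may differ):

import numpy as np


def scale_rewards_sketch(
    rewards: np.ndarray, reward_scale: float = 5.0, reward_bias: float = -1.0
) -> np.ndarray:
    # affine reward shaping; with AntMaze's sparse {0, 1} rewards this maps to {-1, 4}
    return rewards * reward_scale + reward_bias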
42 changes: 34 additions & 8 deletions algorithms/offline/dt.py
@@ -1,5 +1,5 @@
# inspiration:
# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa
# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py
# 2. https://github.com/karpathy/minGPT
import os
import random
@@ -17,44 +17,70 @@
import wandb
from torch.nn import functional as F
from torch.utils.data import DataLoader, IterableDataset
from tqdm.auto import tqdm, trange # noqa
from tqdm.auto import trange

@dataclass
class TrainConfig:
# wandb params
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "DT-D4RL"
# wandb run name
name: str = "DT"
# model params
# transformer hidden dim
embedding_dim: int = 128
# depth of the transformer model
num_layers: int = 3
# number of heads in the attention
num_heads: int = 1
# maximum sequence length during training
seq_len: int = 20
# maximum rollout length, needed for the positional embeddings
episode_len: int = 1000
# attention dropout
attention_dropout: float = 0.1
# residual dropout
residual_dropout: float = 0.1
# embeddings dropout
embedding_dropout: float = 0.1
# maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
# training params
# training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
# AdamW optimizer learning rate
learning_rate: float = 1e-4
# AdamW optimizer betas
betas: Tuple[float, float] = (0.9, 0.999)
# AdamW weight decay
weight_decay: float = 1e-4
# maximum gradient norm during training, optional
clip_grad: Optional[float] = 0.25
# training batch size
batch_size: int = 64
# total training steps
update_steps: int = 100_000
# warmup steps for the learning rate scheduler
warmup_steps: int = 10_000
# reward scaling, to reduce the magnitude
reward_scale: float = 0.001
# number of workers for the pytorch dataloader
num_workers: int = 4
# evaluation params
# target return-to-go for prompting during evaluation
target_returns: Tuple[float, ...] = (12000.0, 6000.0)
# number of episodes to run during evaluation
eval_episodes: int = 100
# evaluation frequency, will evaluate every eval_every training steps
eval_every: int = 10_000
# general params
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# configure PyTorch to use deterministic algorithms instead
# of nondeterministic ones
deterministic_torch: bool = False
# training random seed
train_seed: int = 10
# evaluation random seed
eval_seed: int = 42
# training device
device: str = "cuda"

def __post_init__(self):
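The optimizer-related fields above (learning_rate, betas, weight_decay, warmup_steps) describe an AdamW setup with a linear learning-rate warmup. A minimal sketch of one common way to wire that up (an illustration of the configuration, not necessarily the exact scheduler used in this file):

import torch

model = torch.nn.Linear(17, 128)  # stand-in for the Decision Transformer
optim = torch.optim.AdamW(
    model.parameters(), lr=1e-4, weight_decay=1e-4, betas=(0.9, 0.999)
)
warmup_steps = 10_000
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optim, lambda step: min((step + 1) / warmup_steps, 1.0)
)
# call scheduler.step() after each optimizer.step() during training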
@@ -180,7 +206,7 @@ def __prepare_sample(self, traj_idx, start_idx):

states = (states - self.state_mean) / self.state_std
returns = returns * self.reward_scale
# pad up to seq_len if needed
# pad up to seq_len if needed, padding is masked during training
mask = np.hstack(
[np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])]
)
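The updated comment above clarifies that sampled slices shorter than seq_len are zero-padded and that the padding is masked out during training. A rough sketch of that padding step, assuming the slice is never longer than seq_len and mirroring the np.hstack mask construction shown above (names are illustrative):

import numpy as np


def pad_to_seq_len_sketch(states: np.ndarray, seq_len: int):
    # right-pad the time axis with zeros; mask is 1 for real steps, 0 for padding
    pad = seq_len - states.shape[0]
    mask = np.hstack([np.ones(states.shape[0]), np.zeros(pad)])
    padded = np.concatenate(
        [states, np.zeros((pad, *states.shape[1:]), dtype=states.dtype)], axis=0
    )
    return padded, mask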
(Diffs for the remaining changed files are not shown here.)
