Merge branch 'main' into rebrac-finetune
DT6A authored Dec 6, 2023
2 parents d55af0e + 14abd6e commit 2ed69e3
Showing 43 changed files with 1,810 additions and 216 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,25 @@
name: ci
on:
  push:
    branches:
      - main
      - howuhh/docs-wip
permissions:
  contents: write
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v3
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install mkdocs-material
      - run: mkdocs gh-deploy --force
2 changes: 1 addition & 1 deletion .gitignore
@@ -145,4 +145,4 @@ dmypy.json
.json
.yaml
wandb
assets/
#assets/
178 changes: 94 additions & 84 deletions README.md

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions algorithms/finetune/cal_ql.py
@@ -68,9 +68,9 @@ class TrainConfig:
mixing_ratio: float = 0.5 # Data mixing ratio for online tuning
is_sparse_reward: bool = False # Use sparse reward
# Wandb logging
project: str = "CORL"
group: str = "Cal-QL-D4RL"
name: str = "Cal-QL"
project: str = "CORL" # wandb project name
group: str = "Cal-QL-D4RL" # wandb group name
name: str = "Cal-QL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
@@ -670,7 +670,7 @@ def _q_loss(
mc_returns: torch.Tensor,
alpha: torch.Tensor,
log_dict: Dict,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
) -> torch.Tensor:
q1_predicted = self.critic_1(observations, actions)
q2_predicted = self.critic_2(observations, actions)

@@ -874,7 +874,7 @@ def _q_loss(
)
)

return qf_loss, alpha_prime, alpha_prime_loss
return qf_loss

def train(self, batch: TensorBatch) -> Dict[str, float]:
(
@@ -904,7 +904,7 @@ def train(self, batch: TensorBatch) -> Dict[str, float]:
)

""" Q function loss """
qf_loss, alpha_prime, alpha_prime_loss = self._q_loss(
qf_loss = self._q_loss(
observations,
actions,
next_observations,
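For context on the refactor above: `_q_loss` now returns only the scalar critic loss, and the training loop no longer unpacks `alpha_prime` and `alpha_prime_loss`, presumably because those auxiliary quantities are already recorded through the `log_dict` argument. A minimal sketch of that calling pattern (illustrative names only, not the Cal-QL loss itself):

from typing import Dict

import torch
import torch.nn.functional as F


def q_loss_sketch(
    q1_predicted: torch.Tensor,
    q2_predicted: torch.Tensor,
    td_target: torch.Tensor,
    log_dict: Dict[str, float],
) -> torch.Tensor:
    # both critics regress onto the same TD target
    qf_loss = F.mse_loss(q1_predicted, td_target) + F.mse_loss(q2_predicted, td_target)
    # anything the caller previously unpacked from the return value is logged here
    log_dict["qf_loss"] = qf_loss.item()
    return qf_loss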
9 changes: 3 additions & 6 deletions algorithms/finetune/cql.py
@@ -26,7 +26,6 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -37,7 +36,6 @@ class TrainConfig:
online_iterations: int = int(1e6) # Number of online updates
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
# CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
@@ -65,10 +63,9 @@ class TrainConfig:
q_n_hidden_layers: int = 2 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization
# Wandb logging
project: str = "CORL"
group: str = "CQL-D4RL"
name: str = "CQL"
project: str = "CORL" # wandb project name
group: str = "CQL-D4RL" # wandb group name
name: str = "CQL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
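The per-field comments being added to these TrainConfig dataclasses, here and in the other algorithm files below, document options that are normally overridden from the command line. A minimal sketch of how such a dataclass config is typically consumed, assuming a pyrallis-style entry point (the class and field names below are illustrative, not taken from the file):

from dataclasses import dataclass

import pyrallis


@dataclass
class ConfigSketch:
    project: str = "CORL"  # wandb project name
    seed: int = 0          # training random seed
    device: str = "cuda"   # training device


@pyrallis.wrap()
def train(config: ConfigSketch):
    # pyrallis builds a CLI from the dataclass fields, so defaults can be
    # overridden like: python script.py --seed 42 --device cpu
    print(config)


if __name__ == "__main__":
    train()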
48 changes: 31 additions & 17 deletions algorithms/offline/any_percent_bc.py
@@ -19,26 +19,40 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
eval_freq: int = int(5e3) # How often (time steps) we evaluate
n_episodes: int = 10 # How many episodes run during evaluation
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
# BC
buffer_size: int = 2_000_000 # Replay buffer size
frac: float = 0.1 # Best data fraction to use
max_traj_len: int = 1000 # Max trajectory length
normalize: bool = True # Normalize states
# Wandb logging
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "BC-D4RL"
# wandb run name
name: str = "BC"
# training dataset and evaluation environment
env: str = "halfcheetah-medium-expert-v2"
# total gradient updates during training
max_timesteps: int = int(1e6)
# training batch size
batch_size: int = 256
# maximum size of the replay buffer
buffer_size: int = 2_000_000
# what top fraction of the dataset (sorted by return) to use
frac: float = 0.1
# maximum possible trajectory length
max_traj_len: int = 1000
# whether to normalize states
normalize: bool = True
# discount factor
discount: float = 0.99
# evaluation frequency, will evaluate every eval_freq training steps
eval_freq: int = int(5e3)
# number of episodes to run during evaluation
n_episodes: int = 10
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# file name for loading a model, optional
load_model: str = ""
# training random seed
seed: int = 0
# training device
device: str = "cuda"

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
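The `frac` field above ("what top fraction of the dataset (sorted by return) to use") is what makes this file an any-% BC baseline: only the best-returning trajectories are kept for behaviour cloning. A rough sketch of that selection step, under the assumption that per-trajectory returns are already computed (the helper name is hypothetical, not from the file):

import numpy as np


def top_frac_indices_sketch(traj_returns: np.ndarray, frac: float = 0.1) -> np.ndarray:
    # keep indices of the best `frac` of trajectories, ranked by episodic return
    n_keep = max(1, int(len(traj_returns) * frac))
    order = np.argsort(traj_returns)[::-1]  # descending by return
    return order[:n_keep]


# usage: keep_ids = top_frac_indices_sketch(np.array([102.5, 7.1, 55.0]), frac=0.5)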
48 changes: 34 additions & 14 deletions algorithms/offline/awac.py
@@ -20,29 +20,49 @@

@dataclass
class TrainConfig:
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "AWAC-D4RL"
# wandb run name
name: str = "AWAC"
checkpoints_path: Optional[str] = None

# training dataset and evaluation environment
env_name: str = "halfcheetah-medium-expert-v2"
seed: int = 42
test_seed: int = 69
deterministic_torch: bool = False
device: str = "cuda"

buffer_size: int = 2_000_000
num_train_ops: int = 1_000_000
batch_size: int = 256
eval_frequency: int = 1000
n_test_episodes: int = 10
normalize_reward: bool = False

# actor and critic hidden dim
hidden_dim: int = 256
# actor and critic learning rate
learning_rate: float = 3e-4
# discount factor
gamma: float = 0.99
# coefficient for the target critic Polyak's update
tau: float = 5e-3
# awac actor loss temperature, controlling balance
# between behaviour cloning and Q-value maximization
awac_lambda: float = 1.0
# total number of gradient updates during training
num_train_ops: int = 1_000_000
# training batch size
batch_size: int = 256
# maximum size of the replay buffer
buffer_size: int = 2_000_000
# whether to normalize reward (like in IQL)
normalize_reward: bool = False
# evaluation frequency, will evaluate every eval_frequency
# training steps
eval_frequency: int = 1000
# number of episodes to run during evaluation
n_test_episodes: int = 10
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# configure PyTorch to use deterministic algorithms instead
# of nondeterministic ones
deterministic_torch: bool = False
# training random seed
seed: int = 42
# evaluation random seed
test_seed: int = 69
# training device
device: str = "cuda"

def __post_init__(self):
self.name = f"{self.name}-{self.env_name}-{str(uuid.uuid4())[:8]}"
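The `awac_lambda` comment above describes the actor objective: advantage-weighted behaviour cloning, where the log-likelihood of dataset actions is weighted by exp(A(s, a) / lambda). Smaller lambda leans harder on Q-value maximization, larger lambda behaves more like plain behaviour cloning. A rough sketch of the standard AWAC form (not copied from this file; names are illustrative):

import torch


def awac_actor_loss_sketch(
    log_prob: torch.Tensor,  # log pi(a | s) for dataset actions
    q_value: torch.Tensor,   # Q(s, a) from the critic
    v_value: torch.Tensor,   # value baseline, e.g. Q(s, pi(s))
    awac_lambda: float = 1.0,
) -> torch.Tensor:
    advantage = q_value - v_value
    # exponentiated-advantage weights, clipped for numerical stability
    weights = torch.exp(advantage / awac_lambda).clamp(max=100.0)
    # weighted behaviour cloning: maximize likelihood of high-advantage actions
    return -(weights.detach() * log_prob).mean()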
19 changes: 6 additions & 13 deletions algorithms/offline/cql.py
@@ -23,7 +23,6 @@

@dataclass
class TrainConfig:
# Experiment
device: str = "cuda"
env: str = "halfcheetah-medium-expert-v2" # OpenAI gym environment name
seed: int = 0 # Sets Gym, PyTorch and Numpy seeds
@@ -32,8 +31,6 @@ class TrainConfig:
max_timesteps: int = int(1e6) # Max time steps to run environment
checkpoints_path: Optional[str] = None # Save path
load_model: str = "" # Model load file name, "" doesn't load

# CQL
buffer_size: int = 2_000_000 # Replay buffer size
batch_size: int = 256 # Batch size for all networks
discount: float = 0.99 # Discount factor
Expand All @@ -59,17 +56,13 @@ class TrainConfig:
q_n_hidden_layers: int = 3 # Number of hidden layers in Q networks
reward_scale: float = 1.0 # Reward scale for normalization
reward_bias: float = 0.0 # Reward bias for normalization

# AntMaze hacks
bc_steps: int = int(0) # Number of BC steps at start
reward_scale: float = 5.0
reward_bias: float = -1.0
policy_log_std_multiplier: float = 1.0

# Wandb logging
project: str = "CORL"
group: str = "CQL-D4RL"
name: str = "CQL"
reward_scale: float = 5.0 # Reward scale for normalization
reward_bias: float = -1.0 # Reward bias for normalization
policy_log_std_multiplier: float = 1.0 # Stochastic policy std multiplier
project: str = "CORL" # wandb project name
group: str = "CQL-D4RL" # wandb group name
name: str = "CQL" # wandb run name

def __post_init__(self):
self.name = f"{self.name}-{self.env}-{str(uuid.uuid4())[:8]}"
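The `reward_scale` / `reward_bias` pair kept above configures a simple affine reward transformation, commonly applied to AntMaze tasks. A minimal sketch of what those two fields control (the helper name is assumed, and the exact shaping logic in the file may differ):

import numpy as np


def scale_rewards_sketch(
    rewards: np.ndarray, reward_scale: float = 5.0, reward_bias: float = -1.0
) -> np.ndarray:
    # affine reward shaping; with AntMaze's sparse {0, 1} rewards this maps to {-1, 4}
    return rewards * reward_scale + reward_bias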
42 changes: 34 additions & 8 deletions algorithms/offline/dt.py
@@ -1,5 +1,5 @@
# inspiration:
# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py # noqa
# 1. https://github.com/kzl/decision-transformer/blob/master/gym/decision_transformer/models/decision_transformer.py
# 2. https://github.com/karpathy/minGPT
import os
import random
@@ -17,44 +17,70 @@
import wandb
from torch.nn import functional as F
from torch.utils.data import DataLoader, IterableDataset
from tqdm.auto import tqdm, trange # noqa
from tqdm.auto import trange

@dataclass
class TrainConfig:
# wandb params
# wandb project name
project: str = "CORL"
# wandb group name
group: str = "DT-D4RL"
# wandb run name
name: str = "DT"
# model params
# transformer hidden dim
embedding_dim: int = 128
# depth of the transformer model
num_layers: int = 3
# number of heads in the attention
num_heads: int = 1
# maximum sequence length during training
seq_len: int = 20
# maximum rollout length, needed for the positional embeddings
episode_len: int = 1000
# attention dropout
attention_dropout: float = 0.1
# residual dropout
residual_dropout: float = 0.1
# embeddings dropout
embedding_dropout: float = 0.1
# maximum range for the symmetric actions, [-1, 1]
max_action: float = 1.0
# training params
# training dataset and evaluation environment
env_name: str = "halfcheetah-medium-v2"
# AdamW optimizer learning rate
learning_rate: float = 1e-4
# AdamW optimizer betas
betas: Tuple[float, float] = (0.9, 0.999)
# AdamW weight decay
weight_decay: float = 1e-4
# maximum gradient norm during training, optional
clip_grad: Optional[float] = 0.25
# training batch size
batch_size: int = 64
# total training steps
update_steps: int = 100_000
# warmup steps for the learning rate scheduler
warmup_steps: int = 10_000
# reward scaling, to reduce the magnitude
reward_scale: float = 0.001
# number of workers for the pytorch dataloader
num_workers: int = 4
# evaluation params
# target return-to-go for prompting during evaluation
target_returns: Tuple[float, ...] = (12000.0, 6000.0)
# number of episodes to run during evaluation
eval_episodes: int = 100
# evaluation frequency, will evaluate every eval_every training steps
eval_every: int = 10_000
# general params
# path for checkpoints saving, optional
checkpoints_path: Optional[str] = None
# configure PyTorch to use deterministic algorithms instead
# of nondeterministic ones
deterministic_torch: bool = False
# training random seed
train_seed: int = 10
# evaluation random seed
eval_seed: int = 42
# training device
device: str = "cuda"

def __post_init__(self):
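The optimizer-related fields above (learning_rate, betas, weight_decay, warmup_steps) describe an AdamW setup with a linear learning-rate warmup. A minimal sketch of one common way to wire that up (an illustration of the configuration, not necessarily the exact scheduler used in this file):

import torch

model = torch.nn.Linear(17, 128)  # stand-in for the Decision Transformer
optim = torch.optim.AdamW(
    model.parameters(), lr=1e-4, weight_decay=1e-4, betas=(0.9, 0.999)
)
warmup_steps = 10_000
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optim, lambda step: min((step + 1) / warmup_steps, 1.0)
)
# call scheduler.step() after each optimizer.step() during training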
@@ -180,7 +206,7 @@ def __prepare_sample(self, traj_idx, start_idx):

states = (states - self.state_mean) / self.state_std
returns = returns * self.reward_scale
# pad up to seq_len if needed
# pad up to seq_len if needed, padding is masked during training
mask = np.hstack(
[np.ones(states.shape[0]), np.zeros(self.seq_len - states.shape[0])]
)
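The updated comment above clarifies that sampled slices shorter than seq_len are zero-padded and that the padding is masked out during training. A rough sketch of that padding step, assuming the slice is never longer than seq_len and mirroring the np.hstack mask construction shown above (names are illustrative):

import numpy as np


def pad_to_seq_len_sketch(states: np.ndarray, seq_len: int):
    # right-pad the time axis with zeros; mask is 1 for real steps, 0 for padding
    pad = seq_len - states.shape[0]
    mask = np.hstack([np.ones(states.shape[0]), np.zeros(pad)])
    padded = np.concatenate(
        [states, np.zeros((pad, *states.shape[1:]), dtype=states.dtype)], axis=0
    )
    return padded, mask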
(Diffs for the remaining changed files are not shown here.)
