[references] Unify sched + optim config and add AdamW as option (mind…
felixdittrich92 authored Dec 20, 2024
1 parent 5d4f28e commit e6bf82d
Showing 11 changed files with 327 additions and 124 deletions.
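The reference scripts below all receive the same change: the optimizer and the LR scheduler are now picked from new --optim and --sched CLI flags instead of being hard-coded, with AdamW available next to Adam. As a reading aid, here is a minimal consolidated sketch of the PyTorch-side selection logic; the helper names build_optimizer and build_scheduler are illustrative only (they do not appear in the commit), and the hyperparameters simply mirror the diff below.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR, PolynomialLR


def build_optimizer(model, args):
    # Only trainable parameters are handed to the optimizer, as in the scripts.
    params = [p for p in model.parameters() if p.requires_grad]
    if args.optim == "adam":
        return torch.optim.Adam(params, args.lr, betas=(0.95, 0.999), eps=1e-6, weight_decay=args.weight_decay)
    # "adamw": fall back to a small default decay when --wd is left at its default of 0
    return torch.optim.AdamW(params, args.lr, betas=(0.9, 0.999), eps=1e-6, weight_decay=args.weight_decay or 1e-4)


def build_scheduler(optimizer, args, steps_per_epoch):
    total_steps = args.epochs * steps_per_epoch
    if args.sched == "cosine":
        return CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4)
    if args.sched == "onecycle":
        return OneCycleLR(optimizer, args.lr, total_steps)
    return PolynomialLR(optimizer, total_steps)  # "poly"

Illustrative wiring, assuming a model, a train_loader and parsed args are already in scope:

optimizer = build_optimizer(model, args)
scheduler = build_scheduler(optimizer, args, steps_per_epoch=len(train_loader))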
43 changes: 29 additions & 14 deletions references/classification/train_pytorch_character.py
@@ -15,13 +15,11 @@

import numpy as np
import torch
import wandb
from torch.nn.functional import cross_entropy
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torchvision.transforms.v2 import (
Compose,
GaussianBlur,
InterpolationMode,
Normalize,
RandomGrayscale,
@@ -258,7 +256,7 @@ def main(args):
RandomPhotometricDistort(p=0.1),
T.RandomApply(T.RandomShadow(), p=0.4),
T.RandomApply(T.GaussianNoise(mean=0, std=0.1), 0.1),
T.RandomApply(GaussianBlur(3), 0.3),
T.RandomApply(T.GaussianBlur(sigma=(0.5, 1.5)), 0.3),
RandomPerspective(distortion_scale=0.2, p=0.3),
RandomRotation(15, interpolation=InterpolationMode.BILINEAR),
]),
@@ -281,31 +279,45 @@
return

# Optimizer
optimizer = torch.optim.Adam(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.95, 0.99),
eps=1e-6,
weight_decay=args.weight_decay,
)
if args.optim == "adam":
optimizer = torch.optim.Adam(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.95, 0.999),
eps=1e-6,
weight_decay=args.weight_decay,
)
elif args.optim == "adamw":
optimizer = torch.optim.AdamW(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.9, 0.999),
eps=1e-6,
weight_decay=args.weight_decay or 1e-4,
)

# LR Finder
if args.find_lr:
lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp)
plot_recorder(lrs, losses)
return

# Scheduler
if args.sched == "cosine":
scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
elif args.sched == "onecycle":
scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
elif args.sched == "poly":
scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))

# Training monitoring
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name

# W&B
if args.wb:
import wandb

run = wandb.init(
name=exp_name,
project="character-classification",
@@ -316,7 +328,7 @@
"batch_size": args.batch_size,
"architecture": args.arch,
"input_size": args.input_size,
"optimizer": "adam",
"optimizer": args.optim,
"framework": "pytorch",
"vocab": args.vocab,
"scheduler": args.sched,
@@ -377,7 +389,7 @@ def parse_args():
parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training")
parser.add_argument("--device", default=None, type=int, help="device")
parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = H")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")
parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading")
parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
@@ -412,7 +424,10 @@ def parse_args():
help="Load pretrained parameters before starting the training",
)
parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
parser.add_argument("--sched", type=str, default="cosine", help="scheduler to use")
parser.add_argument("--optim", type=str, default="adam", choices=["adam", "adamw"], help="optimizer to use")
parser.add_argument(
"--sched", type=str, default="cosine", choices=["cosine", "onecycle", "poly"], help="scheduler to use"
)
parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")
parser.add_argument("--early-stop", action="store_true", help="Enable early stopping")
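Note that every scheduler above is sized with args.epochs * len(train_loader) total steps, which implies the learning rate is advanced once per batch rather than once per epoch. A minimal training-step sketch under that assumption (the repository's actual fit_one_epoch helper is not shown in this diff and may differ):

from torch.nn.functional import cross_entropy


def train_one_epoch(model, train_loader, optimizer, scheduler, device="cpu"):
    # Hypothetical loop shape, shown only to illustrate per-batch scheduler stepping.
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = cross_entropy(model(images), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the LR schedule every batch to match its total-step sizing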
43 changes: 29 additions & 14 deletions references/classification/train_pytorch_orientation.py
@@ -15,14 +15,12 @@

import numpy as np
import torch
import wandb
from torch.nn.functional import cross_entropy
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torchvision.transforms import functional as F
from torchvision.transforms.v2 import (
Compose,
GaussianBlur,
Normalize,
RandomGrayscale,
RandomPerspective,
@@ -260,7 +258,7 @@ def main(args):
T.RandomApply(T.ColorInversion(), 0.1),
T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1),
T.RandomApply(T.RandomShadow(), 0.2),
T.RandomApply(GaussianBlur(kernel_size=3), 0.1),
T.RandomApply(T.GaussianBlur(sigma=(0.5, 1.5)), 0.3),
RandomPhotometricDistort(p=0.1),
RandomGrayscale(p=0.1),
RandomPerspective(distortion_scale=0.1, p=0.3),
@@ -287,31 +285,45 @@
return

# Optimizer
optimizer = torch.optim.Adam(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.95, 0.99),
eps=1e-6,
weight_decay=args.weight_decay,
)
if args.optim == "adam":
optimizer = torch.optim.Adam(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.95, 0.999),
eps=1e-6,
weight_decay=args.weight_decay,
)
elif args.optim == "adamw":
optimizer = torch.optim.AdamW(
[p for p in model.parameters() if p.requires_grad],
args.lr,
betas=(0.9, 0.999),
eps=1e-6,
weight_decay=args.weight_decay or 1e-4,
)

# LR Finder
if args.find_lr:
lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp)
plot_recorder(lrs, losses)
return

# Scheduler
if args.sched == "cosine":
scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4)
elif args.sched == "onecycle":
scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader))
elif args.sched == "poly":
scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader))

# Training monitoring
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name

# W&B
if args.wb:
import wandb

run = wandb.init(
name=exp_name,
project="orientation-classification",
@@ -322,7 +334,7 @@
"batch_size": args.batch_size,
"architecture": args.arch,
"input_size": input_size,
"optimizer": "adam",
"optimizer": args.optim,
"framework": "pytorch",
"classes": CLASSES,
"scheduler": args.sched,
@@ -385,7 +397,7 @@ def parse_args():
parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
parser.add_argument("--device", default=None, type=int, help="device")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")
parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading")
parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
@@ -402,7 +414,10 @@
help="Load pretrained parameters before starting the training",
)
parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
parser.add_argument("--sched", type=str, default="cosine", help="scheduler to use")
parser.add_argument("--optim", type=str, default="adam", choices=["adam", "adamw"], help="optimizer to use")
parser.add_argument(
"--sched", type=str, default="cosine", choices=["cosine", "onecycle", "poly"], help="scheduler to use"
)
parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")
parser.add_argument("--early-stop", action="store_true", help="Enable early stopping")
59 changes: 45 additions & 14 deletions references/classification/train_tensorflow_character.py
@@ -234,20 +234,45 @@ def main(args):
plot_samples(x, list(map(vocab.__getitem__, target)))
return

# Scheduler
if args.sched == "exponential":
scheduler = optimizers.schedules.ExponentialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
decay_rate=1 / (25e4), # final lr as a fraction of initial lr
staircase=False,
name="ExponentialDecay",
)
elif args.sched == "poly":
scheduler = optimizers.schedules.PolynomialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
end_learning_rate=1e-7,
power=1.0,
cycle=False,
name="PolynomialDecay",
)

# Optimizer
scheduler = optimizers.schedules.ExponentialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
decay_rate=1 / (1e3), # final lr as a fraction of initial lr
staircase=False,
name="ExponentialDecay",
)
optimizer = optimizers.Adam(
learning_rate=scheduler,
beta_1=0.95,
beta_2=0.99,
epsilon=1e-6,
)
if args.optim == "adam":
optimizer = optimizers.Adam(
learning_rate=scheduler,
beta_1=0.95,
beta_2=0.999,
epsilon=1e-6,
clipnorm=5,
weight_decay=None if args.weight_decay == 0 else args.weight_decay,
)
elif args.optim == "adamw":
optimizer = optimizers.AdamW(
learning_rate=scheduler,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
clipnorm=5,
weight_decay=args.weight_decay or 1e-4,
)

if args.amp:
optimizer = mixed_precision.LossScaleOptimizer(optimizer)

@@ -264,6 +289,7 @@ def main(args):
config = {
"learning_rate": args.lr,
"epochs": args.epochs,
"weight_decay": args.weight_decay,
"batch_size": args.batch_size,
"architecture": args.arch,
"input_size": args.input_size,
@@ -351,7 +377,8 @@ def parse_args():
parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training")
parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")
parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
parser.add_argument(
"--font", type=str, default="FreeMono.ttf,FreeSans.ttf,FreeSerif.ttf", help="Font family to be used"
@@ -384,6 +411,10 @@ def parse_args():
action="store_true",
help="Load pretrained parameters before starting the training",
)
parser.add_argument("--optim", type=str, default="adam", choices=["adam", "adamw"], help="optimizer to use")
parser.add_argument(
"--sched", type=str, default="exponential", choices=["exponential", "poly"], help="scheduler to use"
)
parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")
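On the TensorFlow side the same split is applied, except that the decay schedule is handed to the optimizer's learning_rate rather than stepped separately. A condensed sketch of the selection introduced above; build_tf_optimizer is an illustrative helper (not part of the commit) and assumes a Keras optimizer API recent enough to accept weight_decay and clipnorm, as the diff itself does.

from tensorflow.keras import optimizers


def build_tf_optimizer(args, steps_per_epoch):
    total_steps = args.epochs * steps_per_epoch
    # Schedule selection (--sched): exponential or polynomial decay over the full run.
    if args.sched == "exponential":
        schedule = optimizers.schedules.ExponentialDecay(
            args.lr, decay_steps=total_steps, decay_rate=1 / 25e4, staircase=False
        )
    else:  # "poly"
        schedule = optimizers.schedules.PolynomialDecay(
            args.lr, decay_steps=total_steps, end_learning_rate=1e-7, power=1.0, cycle=False
        )
    # Optimizer selection (--optim): Adam or AdamW, both driven by the schedule.
    if args.optim == "adam":
        return optimizers.Adam(
            learning_rate=schedule,
            beta_1=0.95,
            beta_2=0.999,
            epsilon=1e-6,
            clipnorm=5,
            weight_decay=None if args.weight_decay == 0 else args.weight_decay,
        )
    return optimizers.AdamW(
        learning_rate=schedule,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        clipnorm=5,
        weight_decay=args.weight_decay or 1e-4,
    )

Passing the schedule object as learning_rate lets Keras decay the rate on every optimizer step, which matches the per-batch stepping used on the PyTorch side.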
62 changes: 45 additions & 17 deletions references/classification/train_tensorflow_orientation.py
@@ -244,20 +244,45 @@ def main(args):
plot_samples(x, [CLASSES[t] for t in target])
return

# Scheduler
if args.sched == "exponential":
scheduler = optimizers.schedules.ExponentialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
decay_rate=1 / (25e4), # final lr as a fraction of initial lr
staircase=False,
name="ExponentialDecay",
)
elif args.sched == "poly":
scheduler = optimizers.schedules.PolynomialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
end_learning_rate=1e-7,
power=1.0,
cycle=False,
name="PolynomialDecay",
)

# Optimizer
scheduler = optimizers.schedules.ExponentialDecay(
args.lr,
decay_steps=args.epochs * len(train_loader),
decay_rate=1 / (1e3), # final lr as a fraction of initial lr
staircase=False,
name="ExponentialDecay",
)
optimizer = optimizers.Adam(
learning_rate=scheduler,
beta_1=0.95,
beta_2=0.99,
epsilon=1e-6,
)
if args.optim == "adam":
optimizer = optimizers.Adam(
learning_rate=scheduler,
beta_1=0.95,
beta_2=0.999,
epsilon=1e-6,
clipnorm=5,
weight_decay=None if args.weight_decay == 0 else args.weight_decay,
)
elif args.optim == "adamw":
optimizer = optimizers.AdamW(
learning_rate=scheduler,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
clipnorm=5,
weight_decay=args.weight_decay or 1e-4,
)

if args.amp:
optimizer = mixed_precision.LossScaleOptimizer(optimizer)

@@ -277,8 +302,8 @@ def main(args):
"batch_size": args.batch_size,
"architecture": args.arch,
"input_size": input_size,
"optimizer": "adam",
"framework": "pytorch",
"optimizer": args.optim,
"framework": "tensorflow",
"classes": CLASSES,
"scheduler": args.sched,
"pretrained": args.pretrained,
@@ -364,8 +389,8 @@ def parse_args():
parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on")
parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training")
parser.add_argument("--device", default=None, type=int, help="device")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)")
parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay")
parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)")
parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint")
parser.add_argument("--test-only", dest="test_only", action="store_true", help="Run the validation loop")
parser.add_argument(
@@ -381,7 +406,10 @@
help="Load pretrained parameters before starting the training",
)
parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX")
parser.add_argument("--sched", type=str, default="cosine", help="scheduler to use")
parser.add_argument("--optim", type=str, default="adam", choices=["adam", "adamw"], help="optimizer to use")
parser.add_argument(
"--sched", type=str, default="exponential", choices=["exponential", "poly"], help="scheduler to use"
)
parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true")
parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR")
parser.add_argument("--early-stop", action="store_true", help="Enable early stopping")