# lr_schedulers.py

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Modifications by Roshan Rao
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""

import logging
import math
from typing import Optional, Type

from torch.optim.lr_scheduler import LambdaLR

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class ConstantLRSchedule(LambdaLR):
    """Schedule whose learning-rate multiplier is 1.0 at every step.

    ``warmup_steps`` and ``t_total`` are accepted but never read; they mirror
    the constructor parameters of the other schedules in this module.
    """

    def __init__(
        self,
        optimizer,
        warmup_steps: Optional[int] = None,
        t_total: Optional[int] = None,
        last_epoch: int = -1,
    ):
        def unit_multiplier(_step):
            # The base learning rate is never scaled.
            return 1.0

        super().__init__(optimizer, unit_multiplier, last_epoch=last_epoch)  # type: ignore


class WarmupConstantSchedule(LambdaLR):
    """Linear warmup followed by a constant multiplier.

    The learning-rate multiplier grows linearly from 0 to 1 during the first
    `warmup_steps` steps and stays at 1.0 from then on. `t_total` is accepted
    but not used by this schedule.
    """

    def __init__(
        self,
        optimizer,
        warmup_steps: int,
        t_total: Optional[int] = None,
        last_epoch: int = -1,
    ):
        # Must be set before the base constructor runs: the base class
        # evaluates `lr_lambda` during initialisation.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)  # type: ignore

    def lr_lambda(self, step):
        """Return the multiplier applied to the base learning rate at `step`."""
        if step >= self.warmup_steps:
            return 1.0
        # Still warming up: fraction of the warmup completed so far. The
        # max() guards against a zero divisor.
        warmup_span = float(max(1.0, self.warmup_steps))
        return float(step) / warmup_span


class WarmupLinearSchedule(LambdaLR):
    """Linear warmup followed by linear decay.

    The learning-rate multiplier rises linearly from 0 to 1 over the first
    `warmup_steps` steps, then falls linearly from 1 to 0 over the remaining
    `t_total - warmup_steps` steps. It never goes below 0.
    """

    def __init__(
        self, optimizer, warmup_steps: int, t_total: int, last_epoch: int = -1
    ):
        # Both attributes must exist before the base constructor runs, because
        # the base class evaluates `lr_lambda` during initialisation.
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)  # type: ignore

    def lr_lambda(self, step):
        """Return the multiplier applied to the base learning rate at `step`."""
        warmup = self.warmup_steps
        if step < warmup:
            return float(step) / float(max(1, warmup))
        # Decay phase: share of the post-warmup steps still to go.
        steps_left = float(self.t_total - step)
        decay_span = float(max(1.0, self.t_total - warmup))
        # Clamp so stepping past `t_total` yields 0 rather than a negative value.
        return max(0.0, steps_left / decay_span)


class WarmupCosineSchedule(LambdaLR):
    """Linear warmup and then cosine decay.

    Linearly increases the learning-rate multiplier from 0 to 1 over
    `warmup_steps` training steps, then follows a cosine curve over the
    remaining `t_total - warmup_steps` steps. With the default `cycles=0.5`
    the curve is a half cosine that decays from 1 to 0; other values of
    `cycles` trace that many full cosine periods after warmup.

    Once `step` reaches `t_total` the multiplier is held at its
    end-of-schedule value (0 for the default `cycles`) instead of continuing
    along the cosine, so stepping past `t_total` cannot raise the learning
    rate again.

    Args:
        optimizer: Wrapped optimizer, passed through to `LambdaLR`.
        warmup_steps: Number of steps in the linear warmup phase.
        t_total: Total number of steps the schedule spans.
        cycles: Number of cosine periods covered after warmup.
        last_epoch: Passed through to `LambdaLR`.
    """

    def __init__(
        self,
        optimizer,
        warmup_steps: int,
        t_total: int,
        cycles: float = 0.5,
        last_epoch: int = -1,
    ):
        # These must be set before the base constructor runs, because the base
        # class evaluates `lr_lambda` during initialisation.
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super(WarmupCosineSchedule, self).__init__(
            optimizer, self.lr_lambda, last_epoch=last_epoch
        )  # type: ignore

    def lr_lambda(self, step):
        """Return the multiplier applied to the base learning rate at `step`."""
        if step < self.warmup_steps:
            return float(step) / float(max(1.0, self.warmup_steps))
        # Progress through the post-warmup phase; max() guards a zero divisor.
        progress = float(step - self.warmup_steps) / float(
            max(1, self.t_total - self.warmup_steps)
        )
        # Cap at 1.0: without this, steps beyond `t_total` keep following the
        # periodic cosine and the learning rate climbs back up.
        progress = min(1.0, progress)
        return max(
            0.0, 0.5 * (1.0 + math.cos(math.pi * float(self.cycles) * 2.0 * progress))
        )


class WarmupCosineWithHardRestartsSchedule(LambdaLR):
    """Linear warmup followed by cosine decay with hard restarts.

    The learning-rate multiplier rises linearly from 0 to 1 over the first
    `warmup_steps` steps. After warmup it runs `cycles` (default 1.0) cosine
    decays from 1 towards 0, jumping back to 1 at the start of each cycle.
    Once `t_total` steps have elapsed the multiplier is 0.
    """

    def __init__(
        self,
        optimizer,
        warmup_steps: int,
        t_total: int,
        cycles: float = 1.0,
        last_epoch: int = -1,
    ):
        # These must be set before the base constructor runs, because the base
        # class evaluates `lr_lambda` during initialisation.
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)  # type: ignore

    def lr_lambda(self, step):
        """Return the multiplier applied to the base learning rate at `step`."""
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        # Fraction of the post-warmup phase completed so far.
        decay_span = float(max(1, self.t_total - self.warmup_steps))
        progress = float(step - self.warmup_steps) / decay_span
        if progress >= 1.0:
            return 0.0
        # Position inside the current cycle, in [0, 1); the modulo is what
        # produces the hard restart at each cycle boundary.
        phase = (float(self.cycles) * progress) % 1.0
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * phase)))


# Registry mapping each scheduler's string name to its class; `get` below
# looks names up in this dict.
LR_SCHEDULERS = {
    "constant": ConstantLRSchedule,
    "warmup_constant": WarmupConstantSchedule,
    "warmup_linear": WarmupLinearSchedule,
    "warmup_cosine": WarmupCosineSchedule,
    "warmup_cosine_with_restarts": WarmupCosineWithHardRestartsSchedule,
}


def get(scheduler: str) -> Type[LambdaLR]:
    """Return the scheduler class registered under `scheduler`.

    Args:
        scheduler: Name of the schedule; must be a key of `LR_SCHEDULERS`.

    Returns:
        The matching `LambdaLR` subclass (the class itself, not an instance).

    Raises:
        KeyError: If `scheduler` is not a registered name. The message lists
            the names that are available.
    """
    try:
        return LR_SCHEDULERS[scheduler]
    except KeyError:
        available = ", ".join(sorted(LR_SCHEDULERS))
        # `from None` drops the bare dict-lookup KeyError from the traceback;
        # it adds nothing beyond the message raised here.
        raise KeyError(
            f"Unrecognized lr_scheduler {scheduler}. Available: {available}"
        ) from None