# optimizer.py

#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter

# Default training hyper-parameters. Optimizer.__init__ copies every one of
# these onto the instance it builds.

# Name of the Optimizer method that create_optimizer() looks up and calls.
lr_strategy = 'cosine_decay'

# Schedule lengths, in epochs.
num_epochs = 120
warm_up_epochs = 5.0
step_epochs = [30, 60, 90]  # boundaries used by Optimizer.piecewise_decay

# Exponential-decay settings; Optimizer.exponential_decay_warmup multiplies
# decay_epochs by the steps per epoch before passing it on.
decay_epochs = 2.4
decay_rate = 0.97

# Coefficients handed to the optimizer and the L2 regularizer.
momentum_rate = 0.9
l2_decay = 1e-4

# Number of training images; divided by the batch size to get steps per epoch.
total_images = 1281167


def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Build a cosine learning-rate schedule that changes once per epoch.

    The schedule is
        epoch = floor(global_step / step_each_epoch)
        lr = learning_rate * (cos(epoch * pi / epochs) + 1) / 2

    Args:
        learning_rate: base learning rate (the value at epoch 0).
        step_each_epoch: number of steps in one epoch.
        epochs: number of epochs over which the cosine goes from 1 to -1.

    Returns:
        The result of the fluid ops computing the decayed learning rate.
    """
    # The step counter comes from Paddle's own scheduler helper.
    current_epoch = ops.floor(_decay_step_counter() / step_each_epoch)
    radians_per_epoch = math.pi / epochs
    cosine_term = ops.cos(current_epoch * radians_per_epoch) + 1
    return learning_rate * cosine_term / 2


def cosine_decay_with_warmup(learning_rate,
                             step_each_epoch,
                             epochs=120,
                             warm_up_epoch=5.0):
    """Linear warmup followed by a per-step cosine decay.

    With epoch = floor(global_step / step_each_epoch):

    * while epoch < warm_up_epoch:
        lr = learning_rate * global_step / (step_each_epoch * warm_up_epoch)
    * afterwards:
        lr = learning_rate * (cos((global_step
                                   - warm_up_epoch * step_each_epoch)
                                  * pi / (epochs * step_each_epoch)) + 1) / 2

    Unlike cosine_decay, the value changes on every mini-batch. The cosine
    phase is measured from the end of warmup and its half-period is
    ``epochs`` epochs, so at global epoch ``epochs`` the argument is
    ``(epochs - warm_up_epoch) * pi / epochs`` and the rate has not yet
    reached zero.

    Args:
        learning_rate: peak learning rate, reached at the end of warmup.
        step_each_epoch: number of steps in one epoch.
        epochs: half-period of the cosine, in epochs.
        warm_up_epoch: number of warmup epochs. Defaults to 5.0, mirroring
            the parameter of exponential_decay_with_warmup.

    Returns:
        The persistable global variable named "learning_rate" that the
        schedule writes into.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True)

    epoch = ops.floor(global_step / step_each_epoch)
    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(epoch < warmup_epoch):
            # Warmup: ramp linearly from 0 up to learning_rate.
            decayed_lr = learning_rate * (global_step /
                                          (step_each_epoch * warmup_epoch))
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            # Cosine phase: count steps from the end of warmup.
            steps_after_warmup = global_step - warmup_epoch * step_each_epoch
            radians_per_step = math.pi / (epochs * step_each_epoch)
            decayed_lr = learning_rate * \
                (ops.cos(steps_after_warmup * radians_per_step) + 1)/2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr


def exponential_decay_with_warmup(learning_rate,
                                  step_each_epoch,
                                  decay_epochs,
                                  decay_rate=0.97,
                                  warm_up_epoch=5.0):
    """Linear warmup followed by a staircase exponential decay.

    With epoch = floor(global_step / step_each_epoch):

    * while epoch < warm_up_epoch:
        lr = learning_rate * global_step / (step_each_epoch * warm_up_epoch)
    * afterwards:
        n = floor((global_step - warm_up_epoch * step_each_epoch)
                  / decay_epochs)
        lr = learning_rate * decay_rate ** n

    Args:
        learning_rate: learning rate reached at the end of warmup.
        step_each_epoch: number of steps in one epoch.
        decay_epochs: decay interval. It divides a step count directly, so
            despite the name it is expressed in steps.
        decay_rate: multiplicative factor applied once per decay interval.
        warm_up_epoch: number of warmup epochs.

    Returns:
        The persistable global variable named "learning_rate" that the
        schedule writes into.
    """
    step_counter = _decay_step_counter()
    lr_var = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_bound = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True)

    current_epoch = ops.floor(step_counter / step_each_epoch)
    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(current_epoch < warmup_bound):
            # Warmup: fraction of the warmup steps completed so far.
            warmup_fraction = step_counter / (step_each_epoch * warmup_bound)
            warmup_lr = learning_rate * warmup_fraction
            fluid.layers.assign(input=warmup_lr, output=lr_var)
        with switch.default():
            # Decay: count whole decay intervals elapsed since warmup ended.
            steps_after_warmup = step_counter - warmup_bound * step_each_epoch
            intervals_done = ops.floor(steps_after_warmup / decay_epochs)
            decayed_lr = learning_rate * (decay_rate**intervals_done)
            fluid.layers.assign(input=decayed_lr, output=lr_var)

    return lr_var


def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
    """Wrap a learning rate with a linear warmup (for distributed training).

    While global_step < warmup_steps:
        lr = start_lr + (end_lr - start_lr) * global_step / warmup_steps
    afterwards:
        lr = learning_rate

    Args:
        learning_rate: rate used once warmup is over; a float or a Variable.
        warmup_steps: number of steps the warmup lasts.
        start_lr: rate at step 0; must be a float.
        end_lr: rate the warmup ramps towards; must be a float.

    Returns:
        The persistable global variable named "learning_rate_warmup" that
        the schedule writes into.
    """
    assert (isinstance(end_lr, float))
    assert (isinstance(start_lr, float))
    lr_span = end_lr - start_lr

    # Build the schedule ops under the main program's lr-schedule guard.
    with fluid.default_main_program()._lr_schedule_guard():
        warmup_lr = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="learning_rate_warmup")

        step_counter = _decay_step_counter()

        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(step_counter < warmup_steps):
                progress = step_counter / warmup_steps
                ramped_lr = start_lr + lr_span * progress
                fluid.layers.tensor.assign(ramped_lr, warmup_lr)
            with switch.default():
                fluid.layers.tensor.assign(learning_rate, warmup_lr)

    return warmup_lr


class Optimizer(object):
    """Builds the optimizer / learning-rate schedule combinations.

    Each public method returns a ready-to-use fluid optimizer; the method to
    call is selected by name (see ``lr_strategy``).

    Attributes:
        batch_size: batch size on all devices (taken from ``args``).
        lr: base learning rate (taken from ``args``).
        lr_strategy: learning rate decay strategy.
        l2_decay: L2 regularization coefficient.
        momentum_rate: momentum used by the Momentum and RMSProp optimizers.
        step_epochs: epochs at which piecewise decay changes the rate.
        num_epochs: total number of epochs.
        warm_up_epochs: number of warmup epochs for exponential decay.
        decay_epochs: exponential decay interval, in epochs.
        decay_rate: exponential decay factor.
        total_images: total number of training images.
        step: number of steps in one epoch.
    """

    def __init__(self, args):
        """Read batch size and base lr from ``args``; the rest are defaults.

        Args:
            args: object exposing ``batch_size`` and ``lr`` attributes.
        """
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.lr_strategy = lr_strategy
        self.l2_decay = l2_decay
        self.momentum_rate = momentum_rate
        self.step_epochs = step_epochs
        self.num_epochs = num_epochs
        self.warm_up_epochs = warm_up_epochs
        self.decay_epochs = decay_epochs
        self.decay_rate = decay_rate
        self.total_images = total_images

        # Steps per epoch, rounded up so a partial last batch is counted.
        self.step = int(math.ceil(float(self.total_images) / self.batch_size))

    def _momentum_optimizer(self, learning_rate):
        """Return a Momentum optimizer with the configured momentum and L2."""
        return fluid.optimizer.Momentum(
            learning_rate=learning_rate,
            momentum=self.momentum_rate,
            regularization=fluid.regularizer.L2Decay(self.l2_decay))

    def piecewise_decay(self):
        """piecewise decay with Momentum optimizer

        The rate is divided by 10 at each epoch listed in ``step_epochs``.

        Returns:
            a piecewise_decay optimizer
        """
        boundaries = [self.step * e for e in self.step_epochs]
        # One more value than boundaries: the rate before the first boundary
        # plus one rate after each boundary.
        values = [self.lr * (0.1**i) for i in range(len(boundaries) + 1)]
        learning_rate = fluid.layers.piecewise_decay(
            boundaries=boundaries, values=values)
        return self._momentum_optimizer(learning_rate)

    def cosine_decay(self):
        """cosine decay with Momentum optimizer

        Returns:
            a cosine_decay optimizer
        """
        learning_rate = fluid.layers.cosine_decay(
            learning_rate=self.lr,
            step_each_epoch=self.step,
            epochs=self.num_epochs)
        return self._momentum_optimizer(learning_rate)

    def cosine_decay_warmup(self):
        """cosine decay with warmup, Momentum optimizer

        Returns:
            a cosine_decay_with_warmup optimizer
        """
        learning_rate = cosine_decay_with_warmup(
            learning_rate=self.lr,
            step_each_epoch=self.step,
            epochs=self.num_epochs)
        return self._momentum_optimizer(learning_rate)

    def exponential_decay_warmup(self):
        """exponential decay with warmup, RMSProp optimizer

        Returns:
            an exponential_decay_with_warmup optimizer
        """
        learning_rate = exponential_decay_with_warmup(
            learning_rate=self.lr,
            step_each_epoch=self.step,
            # The schedule expects the decay interval in steps.
            decay_epochs=self.step * self.decay_epochs,
            decay_rate=self.decay_rate,
            warm_up_epoch=self.warm_up_epochs)
        optimizer = fluid.optimizer.RMSProp(
            learning_rate=learning_rate,
            regularization=fluid.regularizer.L2Decay(self.l2_decay),
            momentum=self.momentum_rate,
            rho=0.9,
            epsilon=0.001)
        return optimizer

    def linear_decay(self):
        """linear decay with Momentum optimizer

        The rate falls linearly from ``lr`` to 0 over the whole run
        (``num_epochs`` epochs).

        Returns:
            a linear_decay optimizer
        """
        end_lr = 0
        # decay_steps has to cover the full training run. Passing only the
        # steps of a single epoch makes the rate reach end_lr after the first
        # epoch and stay there for all remaining epochs.
        total_steps = self.step * self.num_epochs
        learning_rate = fluid.layers.polynomial_decay(
            self.lr, total_steps, end_lr, power=1)
        return self._momentum_optimizer(learning_rate)

    def adam_decay(self):
        """Adam optimizer with a constant learning rate

        Returns:
            an adam_decay optimizer
        """
        return fluid.optimizer.Adam(learning_rate=self.lr)

    def cosine_decay_RMSProp(self):
        """cosine decay with RMSProp optimizer

        Returns:
            a cosine_decay_RMSProp optimizer
        """
        learning_rate = fluid.layers.cosine_decay(
            learning_rate=self.lr,
            step_each_epoch=self.step,
            epochs=self.num_epochs)
        optimizer = fluid.optimizer.RMSProp(
            learning_rate=learning_rate,
            momentum=self.momentum_rate,
            regularization=fluid.regularizer.L2Decay(self.l2_decay),
            # Apply epsilon=1 on ImageNet dataset.
            epsilon=1)
        return optimizer

    def default_decay(self):
        """Momentum optimizer with a constant learning rate

        Returns:
            default decay optimizer
        """
        return self._momentum_optimizer(self.lr)


def create_optimizer(args):
    """Build the optimizer selected by the module-level ``lr_strategy``.

    Args:
        args: passed straight to the Optimizer constructor.

    Returns:
        Whatever the Optimizer method named by ``lr_strategy`` returns.
    """
    build = getattr(Optimizer(args), lr_strategy)
    return build()