# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy
import warnings
from paddle import Tensor

__all__ = [  # noqa
    'LRScheduler',
    'NoamDecay',
    'PiecewiseDecay',
    'NaturalExpDecay',
    'InverseTimeDecay',
    'PolynomialDecay',
    'LinearWarmup',
    'ExponentialDecay',
    'MultiStepDecay',
    'StepDecay',
    'LambdaDecay',
    'ReduceOnPlateau',
    'CosineAnnealingDecay',
    'MultiplicativeDecay'
]


class LRScheduler(object):
    """

    LRScheduler base class. Defines the common interface of a learning rate scheduler.

    Users can import it via ``from paddle.optimizer.lr import LRScheduler`` ,

    then subclass it and provide a custom implementation of ``get_lr()`` .

    Otherwise, a ``NotImplementedError`` exception will be raised.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LRScheduler`` instance to schedule learning rate.

    Examples:
        Here is an example of a simple ``StepDecay`` implementation.

        .. code-block:: python

            import paddle
            from paddle.optimizer.lr import LRScheduler

            class StepDecay(LRScheduler):
                def __init__(self,
                            learning_rate,
                            step_size,
                            gamma=0.1,
                            last_epoch=-1,
                            verbose=False):
                    if not isinstance(step_size, int):
                        raise TypeError(
                            "The type of 'step_size' must be 'int', but received %s." %
                            type(step_size))
                    if gamma >= 1.0:
                        raise ValueError('gamma should be < 1.0.')

                    self.step_size = step_size
                    self.gamma = gamma
                    super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    i = self.last_epoch // self.step_size
                    return self.base_lr * (self.gamma**i)

    """

    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of learning rate must be float, but received {}".
                format(type(learning_rate)))
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = last_epoch
        self.verbose = verbose
        self._var_name = None

        self.step()

    def __call__(self):
        """
        Return the latest computed learning rate of the current epoch.
        """
        return self.last_lr

    def step(self, epoch=None):
        """
110

G
guguguzi 已提交
111
        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
112
        The new learning rate will take effect on next ``optimizer.step`` .
113 114 115 116 117 118

        Args:
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None
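
        Examples:
            A minimal sketch of both calling modes; ``StepDecay`` below is only an
            illustrative choice of scheduler:

            .. code-block:: python

                import paddle

                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5)
                scheduler.step()           # auto-increment, last_epoch becomes 1
                scheduler.step(epoch=10)   # or jump to a given epoch explicitly
                print(scheduler())         # latest learning rate: 0.5 * 0.1**2 = 0.005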

        """
        if epoch is None:
            self.last_epoch += 1
            self.last_lr = self.get_lr()
        else:
            self.last_epoch = epoch
            if hasattr(self, "_get_closed_form_lr"):
                self.last_lr = self._get_closed_form_lr()
            else:
                self.last_lr = self.get_lr()

        if self.verbose:
            print('Epoch {}: {} set learning rate to {}.'.format(
                self.last_epoch, self.__class__.__name__, self.last_lr))

    def state_dict(self):
        """

        Returns the state of the scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
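
        Examples:
            A minimal sketch of saving and restoring the state in memory; the
            scheduler arguments below are only illustrative:

            .. code-block:: python

                import paddle

                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5)
                state = scheduler.state_dict()      # e.g. {'last_epoch': 0, 'last_lr': 0.5}

                new_scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5)
                new_scheduler.set_state_dict(state) # resume from the saved epoch and lr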
        """
        self.state_keys()
        state_dict = {}
        for key in self.keys:
            if key not in self.__dict__:
                continue
            value = self.__dict__[key]
            if isinstance(value, Tensor):
                assert value.shape == [
                    1
                ], "shape of Tensor in state_dict must be [1] {}".format(
                    value.shape)
                value = value.numpy()[0]
            state_dict[key] = value

        return state_dict

    # For subclasses that override LRScheduler, "last_epoch, last_lr" will be saved by default.
    # (Note): you can change it for your subclass.
    def state_keys(self):
        """

        For subclasses that override ``LRScheduler`` (the base class): by default, "last_epoch, last_lr" will be saved by ``self.keys = ['last_epoch', 'last_lr']`` .

        ``last_epoch`` is the current epoch number, and ``last_lr`` is the current learning rate.

        If you want to change the default behavior, you should provide a custom implementation of ``state_keys()`` to redefine ``self.keys`` .
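
        Examples:
            A sketch of a subclass that also checkpoints an extra (hypothetical)
            counter besides the defaults:

            .. code-block:: python

                from paddle.optimizer.lr import LRScheduler

                class MyDecay(LRScheduler):
                    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
                        self.num_updates = 0    # extra state worth saving
                        super(MyDecay, self).__init__(learning_rate, last_epoch, verbose)

                    def get_lr(self):
                        return self.base_lr

                    def state_keys(self):
                        # save/restore ``num_updates`` in addition to the defaults
                        self.keys = ['last_epoch', 'last_lr', 'num_updates']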

        """
        self.keys = ['last_epoch', 'last_lr']

    def set_state_dict(self, state_dict):
        """

        Loads the scheduler's state.
        """
        self.state_keys()
        for key in self.keys:
            if key in state_dict:
                self.__dict__[key] = state_dict[key]
            else:
                raise RuntimeError(
                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
                    format(key))
        if len(state_dict) > len(self.keys):
            warnings.warn(
                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
            )

    # alias for set_state_dict
    set_dict = set_state_dict

    def get_lr(self):
        """

        For subclasses that override ``LRScheduler`` (the base class), users should provide a custom implementation of ``get_lr()`` .

        Otherwise, a ``NotImplementedError`` exception will be raised.
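
        Examples:
            A minimal sketch of an override (a constant schedule, purely illustrative):

            .. code-block:: python

                from paddle.optimizer.lr import LRScheduler

                class ConstantLR(LRScheduler):
                    def get_lr(self):
                        # always return the initial learning rate
                        return self.base_lr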
        """
        # calculate by python float
        raise NotImplementedError


class NoamDecay(LRScheduler):
    r"""

    Applies Noam Decay to the initial learning rate.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})

    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_


    Args:
        d_model(int): The dimensionality of the input and output feature vectors of the model. It is a python int number.
        warmup_steps(int): The number of warmup steps, a hyperparameter. It is a python int number.
        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NoamDecay`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self,
                 d_model,
                 warmup_steps,
                 learning_rate=1.0,
                 last_epoch=-1,
                 verbose=False):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        if self.last_epoch == 0:
            a = 1
        else:
            a = self.last_epoch**-0.5
        b = self.warmup_steps**-1.5 * self.last_epoch
        return self.base_lr * (self.d_model**-0.5) * min(a, b)


class PiecewiseDecay(LRScheduler):
    """

    Piecewise learning rate scheduler.

    The algorithm can be described as the code below:

    .. code-block:: text

        boundaries = [100, 200]
        values = [1.0, 0.5, 0.1]
        if epoch < 100:
            learning_rate = 1.0
        elif 100 <= epoch < 200:
            learning_rate = 0.5
        else:
            learning_rate = 0.1

    Args:
        boundaries(list|tuple): A list/tuple of step numbers. The type of element in the list is python int.
        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PiecewiseDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
        self.boundaries = boundaries
        self.values = values
        super(PiecewiseDecay, self).__init__(
            last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        for i in range(len(self.boundaries)):
            if self.last_epoch < self.boundaries[i]:
                return self.values[i]
        return self.values[len(self.values) - 1]


class NaturalExpDecay(LRScheduler):
    r"""

    Applies natural exponential decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = learning\_rate * e^{- gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): A ratio to update the learning rate; it should be greater than 0.0 to make the learning rate decay. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NaturalExpDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert gamma > 0.0, " 'gamma' must be a positive number so that the learning rate will decay."
        self.gamma = gamma
        super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)


class InverseTimeDecay(LRScheduler):
    r"""

    Applies inverse time decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``InverseTimeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
        return self.base_lr / (1 + self.gamma * self.last_epoch)


class PolynomialDecay(LRScheduler):
    r"""

    Applies polynomial decay to the initial learning rate.

    The algorithm can be described as follows.

    If cycle is set to True, then:

    .. math::

        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr

    If cycle is set to False, then:

    .. math::

        epoch & = min(epoch, decay\_steps)

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr


    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of the polynomial; it should be greater than 0.0 to get learning rate decay. Default: 1.0.
        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decreases
            to ``end_lr`` .  If False, the learning rate is monotonically decreasing. Default: False.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PolynomialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 end_lr=0.0001,
                 power=1.0,
                 cycle=False,
                 last_epoch=-1,
                 verbose=False):
        assert decay_steps > 0 and isinstance(
            decay_steps, int), " 'decay_steps' must be a positive integer."
        self.decay_steps = decay_steps
        self.end_lr = end_lr
        assert power > 0.0, " 'power' must be greater than 0.0 so that the learning rate will decay."
        self.power = power
        self.cycle = cycle
        super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
        tmp_epoch_num = self.last_epoch
        tmp_decay_steps = self.decay_steps
        if self.cycle:
            div_res = math.ceil(
                float(self.last_epoch) / float(self.decay_steps))

            if self.last_epoch == 0:
                div_res = 1
            tmp_decay_steps = self.decay_steps * div_res
        else:
            tmp_epoch_num = min(self.last_epoch, self.decay_steps)

        return (self.base_lr - self.end_lr) * (
            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
             )**self.power) + self.end_lr


class LinearWarmup(LRScheduler):
    r"""

    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_

    When epoch < warmup_steps, learning rate is updated as:

    .. math::

            lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}

    where start_lr is the initial learning rate, and end_lr is the final learning rate;

    When epoch >= warmup_steps, learning rate is updated as:

    .. math::

            lr = learning_rate

    where ``learning_rate`` is float or any subclass of ``LRScheduler`` .

    Args:
        learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
        warmup_steps (int): total steps of warm up. It must be a positive integer.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LinearWarmup`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 start_lr,
                 end_lr,
                 last_epoch=-1,
                 verbose=False):
        type_check = isinstance(learning_rate, float) or isinstance(
            learning_rate, int) or isinstance(learning_rate, LRScheduler)
        if not type_check:
            raise TypeError(
                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
                format(learning_rate))
        self.learning_rate = learning_rate
        assert warmup_steps > 0 and isinstance(
            warmup_steps, int), " 'warmup_steps' must be a positive integer."
        self.warmup_steps = warmup_steps
        self.start_lr = start_lr
        self.end_lr = end_lr
        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
            end_lr, start_lr)
        super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)

    def state_dict(self):
        """
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        """
        state_dict = super(LinearWarmup, self).state_dict()
        if isinstance(self.learning_rate, LRScheduler):
            state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
        return state_dict

    def set_state_dict(self, state_dict):
        """
        Loads state_dict for LinearWarmup scheduler.
        """
        super(LinearWarmup, self).set_state_dict(state_dict)
        if isinstance(self.learning_rate, LRScheduler):
            self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])

    def get_lr(self):
        if self.last_epoch < self.warmup_steps:
            return (self.end_lr - self.start_lr) * float(
                self.last_epoch) / float(self.warmup_steps) + self.start_lr
        else:
            if isinstance(self.learning_rate, LRScheduler):
                self.learning_rate.step(self.last_epoch - self.warmup_steps)
                return self.learning_rate()

            return self.learning_rate


class ExponentialDecay(LRScheduler):
    r"""

    Update learning rate by `gamma` each epoch.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = last\_learning\_rate * gamma

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be in interval (0.0, 1.0).
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``ExponentialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert gamma > 0.0 and gamma < 1.0, " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
        self.gamma = gamma
        super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
        return self.base_lr * (self.gamma**self.last_epoch)


class MultiStepDecay(LRScheduler):
    """
    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        milestones = [30, 50]
        gamma = 0.1
        if epoch < 30:
            learning_rate = 0.5
        elif epoch < 50:
            learning_rate = 0.05
        else:
            learning_rate = 0.005

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .


    Returns:
        ``MultiStepDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 milestones,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(milestones, (tuple, list)):
            raise TypeError(
                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
                % type(milestones))

        if not all([
                milestones[i] < milestones[i + 1]
                for i in range(len(milestones) - 1)
        ]):
            raise ValueError('The elements of milestones must be incremented')
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        for i in range(len(self.milestones)):
            if self.last_epoch < self.milestones[i]:
                return self.base_lr * (self.gamma**i)
        return self.base_lr * (self.gamma**len(self.milestones))


class StepDecay(LRScheduler):
    """
    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epochs.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        step_size = 30
        gamma = 0.1

        learning_rate = 0.5     if epoch < 30
        learning_rate = 0.05    if 30 <= epoch < 60
        learning_rate = 0.005   if 60 <= epoch < 90
        ...

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): the interval to update. It must be a positive integer.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``StepDecay`` instance to schedule learning rate.


    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 step_size,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(step_size, int):
            raise TypeError(
                "The type of 'step_size' must be 'int', but received %s." %
                type(step_size))
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        assert step_size > 0 and isinstance(
            step_size, int), " 'step_size' must be a positive integer."
        self.step_size = step_size
        self.gamma = gamma
        super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        i = self.last_epoch // self.step_size
        return self.base_lr * (self.gamma**i)


class LambdaDecay(LRScheduler):
    """
    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95 ** epoch

        learning_rate = 0.5        # epoch 0, 0.5*0.95**0
        learning_rate = 0.475      # epoch 1, 0.5*0.95**1
        learning_rate = 0.45125    # epoch 2, 0.5*0.95**2

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LambdaDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr * self.lr_lambda(self.last_epoch)


class ReduceOnPlateau(LRScheduler):
    """
    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
    by 2 to 10 times once model performance stops improving.

    The ``metrics`` is the one which has been passed into ``step`` , it must be a 1-D Tensor with shape [1]. When ``metrics``
    stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
    (Specially, ``mode`` can also be set to ``'max'`` , in this case, when ``metrics`` stops ascending for a ``patience``
    number of epochs, the learning rate will be reduced.)

    In addition, after each reduction, it will wait a ``cooldown`` number of epochs before resuming the above operation.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
G
guguguzi 已提交
1207 1208
        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning
1209
            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
G
guguguzi 已提交
1210
        factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
1211
            It should be less than 1.0. Default: 0.1.
G
guguguzi 已提交
1212
        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced.
1213
            Default: 10.
G
guguguzi 已提交
1214
        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
1215 1216
            This make tiny changes of ``loss`` will be ignored. Default: 1e-4.
        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
G
guguguzi 已提交
1217
            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum
1218 1219 1220
            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
G
guguguzi 已提交
1221
        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
            the update is ignored. Default: 1e-8.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.
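
    For example, with ``threshold=1e-4`` and a best ``loss`` of 2.0 so far, ``'rel'`` mode treats a new ``loss`` as an
    improvement only if it falls below ``2.0 - 2.0 * 1e-4 = 1.9998`` , while ``'abs'`` mode requires it to fall below
    ``2.0 - 1e-4 = 1.9999`` (an illustrative calculation, not output of the examples below).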


    Returns:
        ``ReduceOnPlateau`` instance to schedule learning rate.


    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step(loss)    # If you update learning rate each step
                # scheduler.step(loss)        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step(out[0])    # If you update learning rate each step
                # scheduler.step(out[0])        # If you update learning rate each epoch

    """

    def __init__(self,
                 learning_rate,
                 mode='min',
                 factor=0.1,
                 patience=10,
                 threshold=1e-4,
                 threshold_mode='rel',
                 cooldown=0,
                 min_lr=0,
                 epsilon=1e-8,
                 verbose=False):
        mode = mode.lower()
        if mode not in ['min', 'max']:
            raise ValueError('mode: ' + mode + ' is unknown!')
        self.mode = mode

        if factor >= 1.0:
            raise ValueError(
                'new_lr = origin_lr * factor and factor should be < 1.0.')
        self.factor = factor

        threshold_mode = threshold_mode.lower()
        if threshold_mode not in ['rel', 'abs']:
            raise ValueError('threshold mode: ' + threshold_mode +
                             ' is unknown!')
        self.threshold_mode = threshold_mode
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
                % type(learning_rate))

        self.patience = patience
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.cooldown = cooldown
        self.min_lr = min_lr
        self.epsilon = epsilon

        self.cooldown_counter = 0
        self.best = None
        self.num_bad_epochs = 0

        # Cannot call the parent __init__ here, so implement the same initialization below.
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = 0
        self.verbose = verbose
        self._var_name = None

    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
    def state_keys(self):
        self.keys = [
            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
            'last_lr'
        ]

    def step(self, metrics, epoch=None):
        """
        ``step`` should be called after ``optimizer.step()`` . It will update the learning rate in the optimizer according to ``metrics`` .
        The new learning rate will take effect on the next epoch.

        Args:
            metrics (Tensor|numpy.ndarray|float): The metric monitored to determine whether the learning rate will be reduced.
                If it stops descending for ``patience`` epochs, the learning rate will be reduced. If it's a 'Tensor' or
                'numpy.ndarray', its shape must be [1].
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        Examples:
            Please refer to the example of the current ``ReduceOnPlateau`` class above.
        """
        if epoch is None:
            self.last_epoch = self.last_epoch + 1
        else:
            self.last_epoch = epoch

        # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
        if isinstance(metrics, (Tensor, numpy.ndarray)):
            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, \
                "the metrics.shape should be (1L,), but the current metrics.shape is {}. Maybe " \
                "you should call paddle.mean to process it first.".format(metrics.shape)
        elif not isinstance(metrics,
                            (int, float, numpy.float32, numpy.float64)):
            raise TypeError(
                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".
                format(type(metrics)))

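        # While cooling down, only decrement the cooldown counter. Otherwise, count
        # epochs without sufficient improvement and reduce the learning rate once
        # that count exceeds ``patience``.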
        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
        else:
            if self.best is None or self._is_better(metrics, self.best):
                self.best = metrics
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1

            if self.num_bad_epochs > self.patience:
                self.cooldown_counter = self.cooldown
                self.num_bad_epochs = 0
                new_lr = max(self.last_lr * self.factor, self.min_lr)
                if self.last_lr - new_lr > self.epsilon:
                    self.last_lr = new_lr
                    if self.verbose:
                        print('Epoch {}: {} set learning rate to {}.'.format(
                            self.last_epoch, self.__class__.__name__,
                            self.last_lr))

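    # Return True if ``current`` improves on the best value so far by more than the
    # configured threshold, according to ``mode`` ('min'/'max') and ``threshold_mode`` ('rel'/'abs').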
    def _is_better(self, current, best):
        if self.mode == 'min' and self.threshold_mode == 'rel':
            return current < best - best * self.threshold

        elif self.mode == 'min' and self.threshold_mode == 'abs':
            return current < best - self.threshold

        elif self.mode == 'max' and self.threshold_mode == 'rel':
            return current > best + best * self.threshold

        else:
            return current > best + self.threshold


class CosineAnnealingDecay(LRScheduler):
    r"""

    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
    SGDR.

    The algorithm can be described as follows.

    .. math::

        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max};

        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.
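
    For instance, with ``learning_rate=0.5`` , ``T_max=10`` and ``eta_min=0`` , the closed-form schedule above gives
    (illustrative values only):

    .. code-block:: text

        epoch:  0      5      10     15     20
        lr:     0.5    0.25   0.0    0.25   0.5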

    Args:
        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to a python float or int number.
        T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer.
        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CosineAnnealingDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
                # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
                # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 T_max,
                 eta_min=0,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(T_max, int):
            raise TypeError(
                "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
                % type(T_max))
        if not isinstance(eta_min, (float, int)):
            raise TypeError(
                "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
                % type(eta_min))
        assert T_max > 0 and isinstance(
            T_max, int), " 'T_max' must be a positive integer."
        self.T_max = T_max
        self.eta_min = float(eta_min)
        super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
                                                   verbose)

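    # get_lr() uses the recursive form of the annealing formula and updates from
    # ``last_lr``; _get_closed_form_lr() evaluates the closed-form expression directly.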
    def get_lr(self):
        if self.last_epoch == 0:
            return self.base_lr
        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
                math.pi / self.T_max)) / 2

        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
                self.last_lr - self.eta_min) + self.eta_min

    def _get_closed_form_lr(self):
        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
            math.pi * self.last_epoch / self.T_max)) / 2


class MultiplicativeDecay(LRScheduler):
    """
    Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95

        learning_rate = 0.5        # epoch 0,
        learning_rate = 0.475      # epoch 1, 0.5*0.95
        learning_rate = 0.45125    # epoch 2, 0.475*0.95

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor from ``epoch`` ; the last learning rate is then multiplied by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiplicativeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch,
                                                  verbose)

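    # Unlike LambdaDecay, which always scales ``base_lr``, this scheduler chains the
    # factors by multiplying the previous learning rate (``last_lr``) each epoch.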
    def get_lr(self):
        if self.last_epoch > 0:
            return self.last_lr * self.lr_lambda(self.last_epoch)
        else:
            return self.base_lr