# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy
import warnings
from paddle import Tensor

__all__ = [  # noqa
    'LRScheduler',
    'NoamDecay',
    'PiecewiseDecay',
    'NaturalExpDecay',
    'InverseTimeDecay',
    'PolynomialDecay',
    'LinearWarmup',
    'ExponentialDecay',
    'MultiStepDecay',
    'StepDecay',
    'LambdaDecay',
    'ReduceOnPlateau',
    'CosineAnnealingDecay',
    'MultiplicativeDecay'
]


class LRScheduler(object):
    """

    LRScheduler base class. Defines the common interface of a learning rate scheduler.

    User can import it by ``from paddle.optimizer.lr import LRScheduler`` ,

    then override it in your subclass with a custom implementation of ``get_lr()`` .

    Otherwise, a ``NotImplementedError`` exception will be thrown.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        instance to schedule learning rate.

    Examples:
        Here is an example of a simple ``StepDecay`` implementation. 

        .. code-block:: python

            import paddle
            from paddle.optimizer.lr import LRScheduler

            class StepDecay(LRScheduler):
                def __init__(self,
                            learning_rate,
                            step_size,
                            gamma=0.1,
                            last_epoch=-1,
                            verbose=False):
                    if not isinstance(step_size, int):
                        raise TypeError(
                            "The type of 'step_size' must be 'int', but received %s." %
                            type(step_size))
                    if gamma >= 1.0:
                        raise ValueError('gamma should be < 1.0.')

                    self.step_size = step_size
                    self.gamma = gamma
                    super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    i = self.last_epoch // self.step_size
                    return self.base_lr * (self.gamma**i)

    """

    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of learning rate must be float, but received {}".
                format(type(learning_rate)))
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = last_epoch
        self.verbose = verbose
        self._var_name = None

        self.step()

    def __call__(self):
        """
        Return the latest computed learning rate of the current epoch.
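
        A small usage sketch (``StepDecay`` below is just one concrete
        scheduler, used here only for illustration):

        .. code-block:: python

            import paddle

            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5)
            current_lr = scheduler()   # same value as scheduler.last_lr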
        """
        return self.last_lr

    def step(self, epoch=None):
        """

        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in the optimizer according to the current ``epoch`` .
        The new learning rate will take effect on the next ``optimizer.step`` .

        Args:
            epoch (int, None): Specify the current epoch. Default: None; the epoch will be auto-incremented from ``last_epoch=-1`` .

        Returns:
            None
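
        Examples:
            A small sketch of both update styles; ``StepDecay`` is just one
            concrete scheduler, used here only for illustration:

            .. code-block:: python

                import paddle

                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5)
                scheduler.step()          # auto-increment last_epoch by 1
                scheduler.step(epoch=10)  # or jump directly to a given epoch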

        """
        if epoch is None:
            self.last_epoch += 1
            self.last_lr = self.get_lr()
        else:
            self.last_epoch = epoch
            if hasattr(self, "_get_closed_form_lr"):
                self.last_lr = self._get_closed_form_lr()
            else:
                self.last_lr = self.get_lr()

        if self.verbose:
            print('Epoch {}: {} set learning rate to {}.'.format(
                self.last_epoch, self.__class__.__name__, self.last_lr))

    def state_dict(self):
        """

        Returns the state of the scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
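
        A minimal sketch of typical use, e.g. when checkpointing (``NoamDecay``
        below is just one concrete scheduler):

        .. code-block:: python

            import paddle

            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100)
            state = scheduler.state_dict()   # e.g. {'last_epoch': ..., 'last_lr': ...}
            # the dict can then be stored alongside the optimizer checkpoint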
        """
        self.state_keys()
        state_dict = {}
        for key in self.keys:
            if key not in self.__dict__:
                continue
            value = self.__dict__[key]
            if isinstance(value, Tensor):
                assert value.shape == [
                    1
                ], "shape of Tensor in state_dict must be [1] {}".format(
                    value.shape)
                value = value.numpy()[0]
            state_dict[key] = value

        return state_dict

    # For subclasses that override LRScheduler, "last_epoch, last_lr" will be saved by default.
    # (Note): you can change it for your subclass.
    def state_keys(self):
        """

        For subclasses that override ``LRScheduler`` (the base class), "last_epoch, last_lr" will be saved by default via ``self.keys = ['last_epoch', 'last_lr']`` .

        ``last_epoch`` is the current epoch number, and ``last_lr`` is the current learning rate.

        If you want to change the default behavior, you should have a custom implementation of ``state_keys()`` to redefine ``self.keys`` .
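
        A hedged sketch of a subclass that also keeps a hypothetical
        ``num_updates`` counter in its saved state:

        .. code-block:: python

            from paddle.optimizer.lr import LRScheduler

            class MyDecay(LRScheduler):
                def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
                    self.num_updates = 0  # hypothetical extra state to keep
                    super(MyDecay, self).__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    return self.base_lr

                def state_keys(self):
                    self.keys = ['last_epoch', 'last_lr', 'num_updates']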

        """
        self.keys = ['last_epoch', 'last_lr']

    def set_state_dict(self, state_dict):
        """

        Loads the scheduler's state.
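
        A minimal sketch of restoring state (assuming ``state`` was produced
        earlier by ``state_dict`` of a compatible scheduler):

        .. code-block:: python

            import paddle

            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100)
            state = {'last_epoch': 10, 'last_lr': 0.05}  # e.g. loaded from a checkpoint
            scheduler.set_state_dict(state)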
        """
        self.state_keys()
        for key in self.keys:
            if key in state_dict:
                self.__dict__[key] = state_dict[key]
            else:
                raise RuntimeError(
                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
                    format(key))
        if len(state_dict) > len(self.keys):
            warnings.warn(
                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
            )

    # alias for set_state_dict
    set_dict = set_state_dict

    def get_lr(self):
        """

        Subclasses that override ``LRScheduler`` (the base class) should provide a custom implementation of ``get_lr()`` .

        Otherwise, a ``NotImplementedError`` exception will be thrown.
        """
        # calculate by python float
        raise NotImplementedError


class NoamDecay(LRScheduler):
    r"""

    Applies Noam Decay to the initial learning rate.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})

    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_


    Args:
        d_model(int): The dimensionality of the input and output feature vectors of the model. It is a python int number.
        warmup_steps(int): The number of warmup steps. It is a hyperparameter, a python int number.
        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NoamDecay`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self,
                 d_model,
                 warmup_steps,
                 learning_rate=1.0,
                 last_epoch=-1,
                 verbose=False):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        if self.last_epoch == 0:
            a = 1
        else:
            a = self.last_epoch**-0.5
        b = self.warmup_steps**-1.5 * self.last_epoch
        return self.base_lr * (self.d_model**-0.5) * min(a, b)


class PiecewiseDecay(LRScheduler):
    """

    Piecewise learning rate scheduler.

    The algorithm can be described as the code below:

    .. code-block:: text

        boundaries = [100, 200]
        values = [1.0, 0.5, 0.1]
        if epoch < 100:
            learning_rate = 1.0
        elif 100 <= epoch < 200:
            learning_rate = 0.5
        else:
            learning_rate = 0.1

    Args:
        boundaries(list|tuple): A list/tuple of step numbers. The type of element in the list is python int.
        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PiecewiseDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
        self.boundaries = boundaries
        self.values = values
        super(PiecewiseDecay, self).__init__(
            last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        for i in range(len(self.boundaries)):
            if self.last_epoch < self.boundaries[i]:
                return self.values[i]
        return self.values[len(self.values) - 1]


class NaturalExpDecay(LRScheduler):
    r"""

    Applies natural exponential decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = learning\_rate * e^{- gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): A ratio to update the learning rate. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NaturalExpDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)


class InverseTimeDecay(LRScheduler):
    r"""

    Applies inverse time decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``InverseTimeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
        return self.base_lr / (1 + self.gamma * self.last_epoch)


class PolynomialDecay(LRScheduler):
    r"""

    Applies polynomial decay to the initial learning rate.

    The algorithm can be described as follows.

    If cycle is set to True, then:

    .. math::

        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr

    If cycle is set to False, then:

    .. math::

        epoch & = min(epoch, decay\_steps)

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr


    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of polynomial. Default: 1.0.
        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decreases
            to ``end_lr`` .  If False, the learning rate is monotone decreasing. Default: False.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PolynomialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 end_lr=0.0001,
                 power=1.0,
                 cycle=False,
                 last_epoch=-1,
                 verbose=False):
        assert decay_steps > 0 and isinstance(
            decay_steps, int), " 'decay_steps' must be a positive integer."
        self.decay_steps = decay_steps
        self.end_lr = end_lr
        self.power = power
        self.cycle = cycle
        super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
        tmp_epoch_num = self.last_epoch
        tmp_decay_steps = self.decay_steps
        if self.cycle:
            div_res = math.ceil(
                float(self.last_epoch) / float(self.decay_steps))

            if self.last_epoch == 0:
                div_res = 1
            tmp_decay_steps = self.decay_steps * div_res
        else:
            tmp_epoch_num = min(self.last_epoch, self.decay_steps)

        return (self.base_lr - self.end_lr) * (
            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
             )**self.power) + self.end_lr


class LinearWarmup(LRScheduler):
    r"""

    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_

    When epoch < warmup_steps, learning rate is updated as:

    .. math::

            lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}

    where start_lr is the initial learning rate, and end_lr is the final learning rate;

    When epoch >= warmup_steps, learning rate is updated as:

    .. math::

            lr = learning_rate

    where ``learning_rate`` is float or any subclass of ``LRScheduler`` .

    Args:
        learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
        warmup_steps (int): Total steps of warm up. It must be a positive integer.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LinearWarmup`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
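
        In addition, a hedged sketch (not part of the original example) of passing
        an ``LRScheduler`` instance as ``learning_rate`` , which takes over once the
        warm up steps finish; ``ExponentialDecay`` is just one possible choice:

        .. code-block:: python

            import paddle

            linear = paddle.nn.Linear(10, 10)
            decay = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                learning_rate=decay, warmup_steps=20, start_lr=0, end_lr=0.5)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())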
    """

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 start_lr,
                 end_lr,
                 last_epoch=-1,
                 verbose=False):
        type_check = isinstance(learning_rate, float) or isinstance(
            learning_rate, int) or isinstance(learning_rate, LRScheduler)
        if not type_check:
            raise TypeError(
                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
                format(learning_rate))
        self.learning_rate = learning_rate
        assert warmup_steps > 0 and isinstance(
            warmup_steps, int), " 'warmup_steps' must be a positive integer."
        self.warmup_steps = warmup_steps
        self.start_lr = start_lr
        self.end_lr = end_lr
        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
            end_lr, start_lr)
        super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)

    def state_dict(self):
        """
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        """
        state_dict = super(LinearWarmup, self).state_dict()
        if isinstance(self.learning_rate, LRScheduler):
            state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
        return state_dict

    def set_state_dict(self, state_dict):
        """
        Loads state_dict for LinearWarmup scheduler.
        """
        super(LinearWarmup, self).set_state_dict(state_dict)
        if isinstance(self.learning_rate, LRScheduler):
            self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])

    def get_lr(self):
        if self.last_epoch < self.warmup_steps:
            return (self.end_lr - self.start_lr) * float(
                self.last_epoch) / float(self.warmup_steps) + self.start_lr
        else:
            if isinstance(self.learning_rate, LRScheduler):
                self.learning_rate.step(self.last_epoch - self.warmup_steps)
                return self.learning_rate()

            return self.learning_rate


class ExponentialDecay(LRScheduler):
    r"""

    Update learning rate by ``gamma`` each epoch.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = last\_learning\_rate * gamma

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``ExponentialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
        return self.base_lr * (self.gamma**self.last_epoch)


class MultiStepDecay(LRScheduler):
    """
    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        milestones = [30, 50]
        gamma = 0.1
        if epoch < 30:
            learning_rate = 0.5
        elif epoch < 50:
            learning_rate = 0.05
        else:
            learning_rate = 0.005

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        milestones (tuple|list): List or tuple of milestone boundaries. Must be increasing.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiStepDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 milestones,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(milestones, (tuple, list)):
            raise TypeError(
                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
                % type(milestones))

        if not all([
                milestones[i] < milestones[i + 1]
                for i in range(len(milestones) - 1)
        ]):
            raise ValueError('The elements of milestones must be incremented')
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        for i in range(len(self.milestones)):
            if self.last_epoch < self.milestones[i]:
                return self.base_lr * (self.gamma**i)
        return self.base_lr * (self.gamma**len(self.milestones))


class StepDecay(LRScheduler):
    """
    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epochs.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        step_size = 30
        gamma = 0.1

        learning_rate = 0.5     if epoch < 30
        learning_rate = 0.05    if 30 <= epoch < 60
        learning_rate = 0.005   if 60 <= epoch < 90
        ...

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): The interval on which to update the learning rate. It must be a positive integer.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``StepDecay`` instance to schedule learning rate.


    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 step_size,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(step_size, int):
            raise TypeError(
                "The type of 'step_size' must be 'int', but received %s." %
                type(step_size))
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        assert step_size > 0 and isinstance(
            step_size, int), " 'step_size' must be a positive integer."
        self.step_size = step_size
        self.gamma = gamma
        super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        i = self.last_epoch // self.step_size
        return self.base_lr * (self.gamma**i)


class LambdaDecay(LRScheduler):
    """
    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95 ** epoch

        learning_rate = 0.5        # epoch 0, 0.5*0.95**0
        learning_rate = 0.475      # epoch 1, 0.5*0.95**1
        learning_rate = 0.45125    # epoch 2, 0.5*0.95**2

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LambdaDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr * self.lr_lambda(self.last_epoch)


class ReduceOnPlateau(LRScheduler):
    """
    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
    by 2 to 10 times once model performance no longer improves.

    The ``metrics`` is the one which has been passed into ``step`` , it must be a 1-D Tensor with shape [1]. When ``metrics``
    stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
    (Specially, ``mode`` can also be set to ``'max'`` , in this case, when ``metrics`` stops ascending for a ``patience``
    number of epochs, the learning rate will be reduced.)

    In addition, after each reduction, it will wait a ``cooldown`` number of epochs before resuming the above operation.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning
            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
        factor (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
            It should be less than 1.0. Default: 0.1.
        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, the learning rate will be reduced.
            Default: 10.
        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
            This means tiny changes of ``loss`` will be ignored. Default: 1e-4.
        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in the last epoch. In ``'abs'`` mode, the minimum
            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
            the update is ignored. Default: 1e-8.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.

    Returns:
        ``ReduceOnPlateau`` instance to schedule learning rate.


    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step(loss)    # If you update learning rate each step
              # scheduler.step(loss)        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step(out[0])    # If you update learning rate each step
              # scheduler.step(out[0])        # If you update learning rate each epoch

    """

    def __init__(self,
                 learning_rate,
                 mode='min',
                 factor=0.1,
                 patience=10,
                 threshold=1e-4,
                 threshold_mode='rel',
                 cooldown=0,
                 min_lr=0,
                 epsilon=1e-8,
                 verbose=False):
        mode = mode.lower()
        if mode not in ['min', 'max']:
            raise ValueError('mode: ' + mode + ' is unknown!')
        self.mode = mode

        if factor >= 1.0:
            raise ValueError(
                'new_lr = origin_lr * factor and factor should be < 1.0.')
        self.factor = factor

        threshold_mode = threshold_mode.lower()
        if threshold_mode not in ['rel', 'abs']:
            raise ValueError('threshold mode: ' + threshold_mode +
                             ' is unknown!')
        self.threshold_mode = threshold_mode
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
                % type(learning_rate))

        self.patience = patience
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.cooldown = cooldown
        self.min_lr = min_lr
        self.epsilon = epsilon

        self.cooldown_counter = 0
        self.best = None
        self.num_bad_epochs = 0

        # Can not call Parent __init__, so implement here.
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = 0
        self.verbose = verbose
        self._var_name = None

    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
    def state_keys(self):
        self.keys = [
            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
            'last_lr'
        ]

    def step(self, metrics, epoch=None):
        """
        ``step`` should be called after ``optimizer.step()`` . It will update the learning rate in the optimizer according to ``metrics`` .
        The new learning rate will take effect on the next epoch.

        Args:
            metrics (Tensor|numpy.ndarray|float): The value monitored to determine whether the learning rate will be reduced.
                If it stops descending for a ``patience`` number of epochs, the learning rate will be reduced. If it's a 'Tensor' or
                'numpy.ndarray', its shape must be [1].
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        Examples:
            Please refer to the example of current LRScheduler.
        """
        if epoch is None:
            self.last_epoch = self.last_epoch + 1
        else:
            self.last_epoch = epoch

        # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
        if isinstance(metrics, (Tensor, numpy.ndarray)):
            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
                "should be [1], but the current metrics.shape is {}. Maybe you " \
                "should call paddle.mean to process it first.".format(metrics.shape)
        elif not isinstance(metrics,
                            (int, float, numpy.float32, numpy.float64)):
            raise TypeError(
                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".
                format(type(metrics)))

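        # While cooling down after a reduction, only decrement the cooldown counter.
        # Otherwise track the best metric seen so far and count consecutive "bad"
        # epochs; once that count exceeds `patience`, restart the cooldown period and
        # shrink the learning rate by `factor` (bounded below by `min_lr`), applying
        # the new rate only if it drops by more than `epsilon`.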
        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
        else:
            if self.best is None or self._is_better(metrics, self.best):
                self.best = metrics
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1

            if self.num_bad_epochs > self.patience:
                self.cooldown_counter = self.cooldown
                self.num_bad_epochs = 0
                new_lr = max(self.last_lr * self.factor, self.min_lr)
                if self.last_lr - new_lr > self.epsilon:
                    self.last_lr = new_lr
                    if self.verbose:
                        print('Epoch {}: {} set learning rate to {}.'.format(
                            self.last_epoch, self.__class__.__name__,
                            self.last_lr))

    def _is_better(self, current, best):
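        # Decide whether `current` improves on `best` for each of the four
        # combinations of `mode` ('min'/'max') and `threshold_mode` ('rel'/'abs').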
        if self.mode == 'min' and self.threshold_mode == 'rel':
            return current < best - best * self.threshold

        elif self.mode == 'min' and self.threshold_mode == 'abs':
            return current < best - self.threshold

        elif self.mode == 'max' and self.threshold_mode == 'rel':
            return current > best + best * self.threshold

        else:
            return current > best + self.threshold


class CosineAnnealingDecay(LRScheduler):
    r"""

    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
    SGDR.

    The algorithm can be described as follows.

    .. math::

        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max};

        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.
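    For example, with ``learning_rate=0.5`` , ``T_max=10`` and ``eta_min=0`` , the annealed learning rate
    roughly follows the curve below (an illustrative sketch of the cosine schedule, not literal scheduler output):

    .. code-block:: text

        lr(epoch) = 0.25 * (1 + cos(pi * epoch / 10))
        # epoch 0  -> 0.5
        # epoch 5  -> 0.25
        # epoch 10 -> 0.0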

    Args:
        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to a python float or int number.
        T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer.
        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CosineAnnealingDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 T_max,
                 eta_min=0,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(T_max, int):
            raise TypeError(
                "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
                % type(T_max))
        if not isinstance(eta_min, (float, int)):
            raise TypeError(
                "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
                % type(eta_min))
        assert T_max > 0 and isinstance(
            T_max, int), " 'T_max' must be a positive integer."
        self.T_max = T_max
        self.eta_min = float(eta_min)
        super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
                                                   verbose)

    def get_lr(self):
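        # Step-wise (recurrence) form of cosine annealing, tracking the closed-form
        # expression in `_get_closed_form_lr`. The `elif` branch applies the special
        # update from the docstring formula for epochs just past T_cur = (2k+1) * T_max.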
        if self.last_epoch == 0:
            return self.base_lr
        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
                math.pi / self.T_max)) / 2

        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
                self.last_lr - self.eta_min) + self.eta_min

    def _get_closed_form_lr(self):
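        # Closed form: eta_min + (eta_max - eta_min) * (1 + cos(pi * last_epoch / T_max)) / 2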
        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
            math.pi * self.last_epoch / self.T_max)) / 2


class MultiplicativeDecay(LRScheduler):
    """
    Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95

        learning_rate = 0.5        # epoch 0,
        learning_rate = 0.475      # epoch 1, 0.5*0.95
        learning_rate = 0.45125    # epoch 2, 0.475*0.95

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor from ``epoch`` ; the last learning rate is then multiplied by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiplicativeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch,
                                                  verbose)

    def get_lr(self):
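        # Unlike LambdaDecay, the factor multiplies the previous learning rate, so
        # per-epoch factors compound; epoch 0 keeps the initial learning rate.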
        if self.last_epoch > 0:
            return self.last_lr * self.lr_lambda(self.last_epoch)
        else:
            return self.base_lr