# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy
import warnings
from paddle import Tensor
import paddle.fluid.core as core
from ..fluid.framework import _in_eager_mode

__all__ = [  # noqa
    'LRScheduler',
    'NoamDecay',
    'PiecewiseDecay',
    'NaturalExpDecay',
    'InverseTimeDecay',
    'PolynomialDecay',
    'LinearWarmup',
    'ExponentialDecay',
    'MultiStepDecay',
    'StepDecay',
    'LambdaDecay',
    'ReduceOnPlateau',
    'CosineAnnealingDecay',
    'MultiplicativeDecay'
]


class LRScheduler(object):
    """

    LRScheduler Base class. Define the common interface of a learning rate scheduler.

    User can import it by ``from paddle.optimizer.lr import LRScheduler`` ,

    then subclass it and provide a custom implementation of ``get_lr()`` .

    Otherwise, a ``NotImplementedError`` exception will be thrown.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        instance to schedule learning rate.

    Examples:
        Here is an example of a simple ``StepDecay`` implementation.

        .. code-block:: python

            import paddle
            from paddle.optimizer.lr import LRScheduler

            class StepDecay(LRScheduler):
                def __init__(self,
                            learning_rate,
                            step_size,
                            gamma=0.1,
                            last_epoch=-1,
                            verbose=False):
                    if not isinstance(step_size, int):
                        raise TypeError(
                            "The type of 'step_size' must be 'int', but received %s." %
                            type(step_size))
                    if gamma >= 1.0:
                        raise ValueError('gamma should be < 1.0.')

                    self.step_size = step_size
                    self.gamma = gamma
                    super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    i = self.last_epoch // self.step_size
                    return self.base_lr * (self.gamma**i)

    """

    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of learning rate must be float, but received {}".
                format(type(learning_rate)))
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = last_epoch
        self.verbose = verbose
        self._var_name = None

        self.step()

    def __call__(self):
        """
        Return the latest computed learning rate of the current epoch.
        """
        return self.last_lr

    def step(self, epoch=None):
        """
112

G
guguguzi 已提交
113
        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
114
        The new learning rate will take effect on next ``optimizer.step`` .
115 116 117 118 119 120

        Args:
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None
        """
        if epoch is None:
            self.last_epoch += 1
            self.last_lr = self.get_lr()
        else:
            self.last_epoch = epoch
            if hasattr(self, "_get_closed_form_lr"):
                self.last_lr = self._get_closed_form_lr()
            else:
                self.last_lr = self.get_lr()

        if self.verbose:
            print('Epoch {}: {} set learning rate to {}.'.format(
                self.last_epoch, self.__class__.__name__, self.last_lr))

    def state_dict(self):
        """
139

140 141
        Returns the state of the scheduler as a :class:`dict`.

142
        It is a subset of ``self.__dict__`` .
143
        """
144
        self.state_keys()
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
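        # Collect only the keys listed by state_keys(); 1-element Tensors are converted
        # to python scalars before saving.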
        state_dict = {}
        for key in self.keys:
            if key not in self.__dict__:
                continue
            value = self.__dict__[key]
            if isinstance(value, Tensor):
                assert value.shape == [
                    1
                ], "shape of Tensor in state_dict must be [1] {}".format(
                    value.shape)
                value = value.numpy()[0]
            state_dict[key] = value

        return state_dict

    # For subclasses that override LRScheduler, "last_epoch, last_lr" will be saved by default.
    # (Note): you can change it for your subclass.
    def state_keys(self):
        """

        For subclasses that override ``LRScheduler`` (Base Class), "last_epoch, last_lr" will be saved by default through ``self.keys = ['last_epoch', 'last_lr']`` .

        ``last_epoch`` is the current epoch num, and ``last_lr`` is the current learning rate.

        If you want to change the default behavior, you should have a custom implementation of ``state_keys()`` to redefine ``self.keys`` .

        """
        self.keys = ['last_epoch', 'last_lr']

    def set_state_dict(self, state_dict):
        """

        Loads the scheduler's state.
        """
        self.state_keys()
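        # Restore every key reported by state_keys(); a missing key raises an error,
        # while extra keys in state_dict only trigger a warning.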
        for key in self.keys:
            if key in state_dict:
                self.__dict__[key] = state_dict[key]
            else:
                raise RuntimeError(
                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
                    format(key))
        if len(state_dict) > len(self.keys):
            warnings.warn(
                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
            )

    # alias for set_state_dict
    set_dict = set_state_dict

    def get_lr(self):
        """

        For subclasses that override ``LRScheduler`` (Base Class), the user should provide a custom implementation of ``get_lr()`` .

        Otherwise, a ``NotImplementedError`` exception will be thrown.
        """
        # calculate by python float
        raise NotImplementedError


class NoamDecay(LRScheduler):
    r"""

    Applies Noam Decay to the initial learning rate.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})

    Please refer to `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_


    Args:
        d_model (int): The dimensionality of input and output feature vectors of the model. It is a python int number.
        warmup_steps (int): The number of warmup steps. A hyperparameter. It is a python int number.
        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NoamDecay`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self,
                 d_model,
                 warmup_steps,
                 learning_rate=1.0,
                 last_epoch=-1,
                 verbose=False):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
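        # Noam schedule: the learning rate grows linearly during the first
        # ``warmup_steps`` epochs and then decays with the inverse square root of the epoch.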
        if self.last_epoch == 0:
            a = 1
        else:
            a = self.last_epoch**-0.5
        b = self.warmup_steps**-1.5 * self.last_epoch
        return self.base_lr * (self.d_model**-0.5) * min(a, b)


class PiecewiseDecay(LRScheduler):
    """

    Piecewise learning rate scheduler.

    The algorithm can be described as the code below:

    .. code-block:: text

        boundaries = [100, 200]
        values = [1.0, 0.5, 0.1]
        if epoch < 100:
            learning_rate = 1.0
        elif 100 <= epoch < 200:
            learning_rate = 0.5
        else:
            learning_rate = 0.1

    Args:
        boundaries(list|tuple): A list/tuple of step numbers. The type of element in the list is python int.
        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PiecewiseDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
        self.boundaries = boundaries
        self.values = values
        super(PiecewiseDecay, self).__init__(
            last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
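        # Return the value of the first interval whose boundary the current epoch has
        # not reached yet; after the last boundary, keep the final value.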
        for i in range(len(self.boundaries)):
            if self.last_epoch < self.boundaries[i]:
                return self.values[i]
        return self.values[len(self.values) - 1]


class NaturalExpDecay(LRScheduler):
    r"""

    Applies natural exponential decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = learning\_rate * e^{- gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): A ratio to update the learning rate; it should be greater than 0.0 to make the learning rate decay. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NaturalExpDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert gamma > 0.0, " 'gamma' must be a positive number so that the learning rate will decay."
        self.gamma = gamma
        super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
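        # Closed-form natural exponential decay: lr = base_lr * exp(-gamma * epoch).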
        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)


class InverseTimeDecay(LRScheduler):
    r"""

    Applies inverse time decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``InverseTimeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
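        # Closed-form inverse time decay: lr = base_lr / (1 + gamma * epoch).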
        return self.base_lr / (1 + self.gamma * self.last_epoch)


class PolynomialDecay(LRScheduler):
    r"""

    Applies polynomial decay to the initial learning rate.

    The algorithm can be described as follows.

    If cycle is set to True, then:

    .. math::

        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr

    If cycle is set to False, then:

    .. math::

        epoch & = min(epoch, decay\_steps)

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr


    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of polynomial; it should be greater than 0.0 to get learning rate decay. Default: 1.0.
        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decreases
            to ``end_lr`` .  If False, the learning rate is monotonically decreasing. Default: False.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PolynomialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 decay_steps,
                 end_lr=0.0001,
                 power=1.0,
                 cycle=False,
                 last_epoch=-1,
                 verbose=False):
        assert decay_steps > 0 and isinstance(
            decay_steps, int), " 'decay_steps' must be a positive integer."
        self.decay_steps = decay_steps
        self.end_lr = end_lr
        assert power > 0.0, " 'power' must be greater than 0.0 so that the learning rate will decay."
        self.power = power
        self.cycle = cycle
        super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
                                              verbose)

    def get_lr(self):
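        # With cycle=True the decay period is stretched to the next multiple of
        # decay_steps, so the rate can rise again after reaching end_lr; otherwise the
        # epoch is clamped at decay_steps and the rate stays at end_lr afterwards.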
        tmp_epoch_num = self.last_epoch
        tmp_decay_steps = self.decay_steps
        if self.cycle:
            div_res = math.ceil(
                float(self.last_epoch) / float(self.decay_steps))

            if self.last_epoch == 0:
                div_res = 1
            tmp_decay_steps = self.decay_steps * div_res
        else:
            tmp_epoch_num = min(self.last_epoch, self.decay_steps)

        return (self.base_lr - self.end_lr) * (
            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)
             )**self.power) + self.end_lr


class LinearWarmup(LRScheduler):
    r"""

    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_

    When epoch < warmup_steps, learning rate is updated as:

    .. math::

            lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}

    where start_lr is the initial learning rate, and end_lr is the final learning rate;

    When epoch >= warmup_steps, learning rate is updated as:

    .. math::

            lr = learning_rate

    where ``learning_rate`` is float or any subclass of ``LRScheduler`` .

    Args:
        learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
        warmup_steps (int): total steps of warm up. It must be a positive integer.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LinearWarmup`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 start_lr,
                 end_lr,
                 last_epoch=-1,
                 verbose=False):
        type_check = isinstance(learning_rate, float) or isinstance(
            learning_rate, int) or isinstance(learning_rate, LRScheduler)
        if not type_check:
            raise TypeError(
                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".
                format(learning_rate))
        self.learning_rate = learning_rate
        assert warmup_steps > 0 and isinstance(
            warmup_steps, int), " 'warmup_steps' must be a positive integer."
        self.warmup_steps = warmup_steps
        self.start_lr = start_lr
        self.end_lr = end_lr
        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
            end_lr, start_lr)
        super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)

    def state_dict(self):
        """
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        """
        state_dict = super(LinearWarmup, self).state_dict()
        if isinstance(self.learning_rate, LRScheduler):
            state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
        return state_dict

    def set_state_dict(self, state_dict):
        """
        Loads state_dict for LinearWarmup scheduler.
        """
        super(LinearWarmup, self).set_state_dict(state_dict)
        if isinstance(self.learning_rate, LRScheduler):
            self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])

    def get_lr(self):
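        # Interpolate linearly between start_lr and end_lr during warmup, then delegate
        # to the wrapped scheduler (or return the constant learning_rate).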
        if self.last_epoch < self.warmup_steps:
            return (self.end_lr - self.start_lr) * float(
                self.last_epoch) / float(self.warmup_steps) + self.start_lr
        else:
            if isinstance(self.learning_rate, LRScheduler):
                self.learning_rate.step(self.last_epoch - self.warmup_steps)
                return self.learning_rate()

            return self.learning_rate


class ExponentialDecay(LRScheduler):
    r"""

    Update learning rate by ``gamma`` each epoch.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = last\_learning\_rate * gamma

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be in interval (0.0, 1.0).
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``ExponentialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert gamma > 0.0 and gamma < 1.0, " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
        self.gamma = gamma
        super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
                                               verbose)

    def get_lr(self):
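        # Closed-form exponential decay: lr = base_lr * gamma ** epoch.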
        return self.base_lr * (self.gamma**self.last_epoch)


class MultiStepDecay(LRScheduler):
    """
    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        milestones = [30, 50]
        gamma = 0.1
        if epoch < 30:
            learning_rate = 0.5
        elif epoch < 50:
            learning_rate = 0.05
        else:
            learning_rate = 0.005

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        milestones (tuple|list): List or tuple of milestone boundaries. Must be increasing.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .


    Returns:
        ``MultiStepDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 milestones,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(milestones, (tuple, list)):
            raise TypeError(
                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
                % type(milestones))

        if not all([
                milestones[i] < milestones[i + 1]
                for i in range(len(milestones) - 1)
        ]):
            raise ValueError('The elements of milestones must be incremented')
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        self.milestones = milestones
        self.gamma = gamma
        super(MultiStepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
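        # Decay once for every milestone that the current epoch has already passed.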
        for i in range(len(self.milestones)):
            if self.last_epoch < self.milestones[i]:
                return self.base_lr * (self.gamma**i)
        return self.base_lr * (self.gamma**len(self.milestones))


class StepDecay(LRScheduler):
    """
    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epochs.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        step_size = 30
        gamma = 0.1

        learning_rate = 0.5     if epoch < 30
        learning_rate = 0.05    if 30 <= epoch < 60
        learning_rate = 0.005   if 60 <= epoch < 90
        ...

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): the interval to update. It must be a positive integer.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``StepDecay`` instance to schedule learning rate.


    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 step_size,
                 gamma=0.1,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(step_size, int):
            raise TypeError(
                "The type of 'step_size' must be 'int', but received %s." %
                type(step_size))
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        assert step_size > 0 and isinstance(
            step_size, int), " 'step_size' must be a positive integer."
        self.step_size = step_size
        self.gamma = gamma
        super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
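        # Decay once every step_size epochs: lr = base_lr * gamma ** (epoch // step_size).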
        i = self.last_epoch // self.step_size
        return self.base_lr * (self.gamma**i)


class LambdaDecay(LRScheduler):
    """
    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95 ** epoch

        learning_rate = 0.5        # epoch 0, 0.5*0.95**0
        learning_rate = 0.475      # epoch 1, 0.5*0.95**1
        learning_rate = 0.45125    # epoch 2, 0.5*0.95**2

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LambdaDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
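        # Scale the base learning rate by the user-supplied factor for the current epoch.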
        return self.base_lr * self.lr_lambda(self.last_epoch)


class ReduceOnPlateau(LRScheduler):
    """
    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
    by 2 to 10 times once model performance no longer improves.

    The ``metrics`` is the one which has been passed into ``step`` , it must be a 1-D Tensor with shape [1]. When ``metrics``
    stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
    (Specially, ``mode`` can also be set to ``'max'`` , in this case, when ``metrics`` stops ascending for a ``patience``
    number of epochs, the learning rate will be reduced.)

    In addition, after each reduction, it will wait a ``cooldown`` number of epochs before resuming the above operation.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning
            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
        factor (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
            It should be less than 1.0. Default: 0.1.
        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, the learning rate will be reduced.
            Default: 10.
        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
            This makes tiny changes of ``loss`` be ignored. Default: 1e-4.
        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum
            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
            the update is ignored. Default: 1e-8.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.


    Returns:
        ``ReduceOnPlateau`` instance to schedule learning rate.


    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step(loss)    # If you update learning rate each step
              # scheduler.step(loss)        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step(out[0])    # If you update learning rate each step
              # scheduler.step(out[0])        # If you update learning rate each epoch

    """

    def __init__(self,
                 learning_rate,
                 mode='min',
                 factor=0.1,
                 patience=10,
                 threshold=1e-4,
                 threshold_mode='rel',
                 cooldown=0,
                 min_lr=0,
                 epsilon=1e-8,
                 verbose=False):
        mode = mode.lower()
        if mode not in ['min', 'max']:
            raise ValueError('mode: ' + mode + ' is unknown!')
        self.mode = mode

        if factor >= 1.0:
            raise ValueError(
                'new_lr = origin_lr * factor and factor should be < 1.0.')
        self.factor = factor

        threshold_mode = threshold_mode.lower()
        if threshold_mode not in ['rel', 'abs']:
            raise ValueError('threshold mode: ' + threshold_mode +
                             ' is unknown!')
        self.threshold_mode = threshold_mode
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
                % type(learning_rate))

        self.patience = patience
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.cooldown = cooldown
        self.min_lr = min_lr
        self.epsilon = epsilon

        self.cooldown_counter = 0
        self.best = None
        self.num_bad_epochs = 0

        # Cannot call the parent class's __init__ here, so set up the equivalent state directly.
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = 0
        self.verbose = verbose
        self._var_name = None

    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
    def state_keys(self):
        self.keys = [
            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
            'last_lr'
        ]

    def step(self, metrics, epoch=None):
        """
        ``step`` should be called after ``optimizer.step()`` . It will update the learning rate in the optimizer according to ``metrics`` .
        The new learning rate will take effect on the next epoch.

        Args:
            metrics (Tensor|numpy.ndarray|float): The value that is monitored to determine whether the learning rate should be reduced.
                If it stops descending for a ``patience`` number of epochs, the learning rate will be reduced. If it is a 'Tensor' or
                'numpy.ndarray', its shape must be [1].
            epoch (int, None): Specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        Examples:
            Please refer to the example of current LRScheduler.
        """
        if epoch is None:
            self.last_epoch = self.last_epoch + 1
        else:
            self.last_epoch = epoch

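        # Pick the Tensor class that matches the current execution mode so the
        # isinstance check below covers both eager and legacy dygraph tensors.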
        if _in_eager_mode():
            tmp = core.eager.EagerTensor
        else:
            tmp = Tensor
        # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
        if isinstance(metrics, (tmp, numpy.ndarray)):
            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, \
                "the metrics.shape should be [1], but the current metrics.shape is {}. " \
                "Maybe you should call paddle.mean to process it first.".format(
                    metrics.shape)
        elif not isinstance(metrics,
                            (int, float, numpy.float32, numpy.float64)):
            raise TypeError(
                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but received {}".
                format(type(metrics)))

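        # During the cooldown window only the counter is decremented; afterwards
        # track how many consecutive epochs the metric failed to improve, and
        # shrink the learning rate once that count exceeds ``patience``.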
        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
        else:
            if self.best is None or self._is_better(metrics, self.best):
                self.best = metrics
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1

            if self.num_bad_epochs > self.patience:
                self.cooldown_counter = self.cooldown
                self.num_bad_epochs = 0
                new_lr = max(self.last_lr * self.factor, self.min_lr)
                if self.last_lr - new_lr > self.epsilon:
                    self.last_lr = new_lr
                    if self.verbose:
                        print('Epoch {}: {} set learning rate to {}.'.format(
                            self.last_epoch, self.__class__.__name__,
                            self.last_lr))

    def _is_better(self, current, best):
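        # 'rel' interprets ``threshold`` as a fraction of the best value seen so
        # far, while 'abs' interprets it as an absolute margin; 'min' and 'max'
        # flip the direction of the comparison.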
        if self.mode == 'min' and self.threshold_mode == 'rel':
            return current < best - best * self.threshold

        elif self.mode == 'min' and self.threshold_mode == 'abs':
            return current < best - self.threshold

        elif self.mode == 'max' and self.threshold_mode == 'rel':
            return current > best + best * self.threshold

        else:
            return current > best + self.threshold


class CosineAnnealingDecay(LRScheduler):
    r"""

    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
    SGDR.

    The algorithm can be described as follows.

    .. math::

        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max};

        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.

    Args:
        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to a python float or int number.
        T_max (int): Maximum number of iterations. It is half of the decay cycle of the learning rate. It must be a positive integer.
        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CosineAnnealingDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self,
                 learning_rate,
                 T_max,
                 eta_min=0,
                 last_epoch=-1,
                 verbose=False):
        if not isinstance(T_max, int):
            raise TypeError(
                "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
                % type(T_max))
        if not isinstance(eta_min, (float, int)):
            raise TypeError(
                "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
                % type(eta_min))
        assert T_max > 0 and isinstance(
            T_max, int), " 'T_max' must be a positive integer."
        self.T_max = T_max
        self.eta_min = float(eta_min)
        super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
                                                   verbose)

    def get_lr(self):
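        # Chained form of cosine annealing: each call rescales the previous
        # learning rate so the sequence follows the closed-form schedule in
        # _get_closed_form_lr, with a special-case update at the boundaries
        # where T_cur reaches an odd multiple of T_max.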
        if self.last_epoch == 0:
            return self.base_lr
        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
            return self.last_lr + (self.base_lr - self.eta_min) * (1 - math.cos(
                math.pi / self.T_max)) / 2

        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
                self.last_lr - self.eta_min) + self.eta_min

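    # Closed-form value of the schedule at ``last_epoch``; following the chained
    # update in get_lr from epoch 0 yields the same values.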
    def _get_closed_form_lr(self):
        return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos(
            math.pi * self.last_epoch / self.T_max)) / 2


class MultiplicativeDecay(LRScheduler):
    """
    Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95

        learning_rate = 0.5        # epoch 0,
        learning_rate = 0.475      # epoch 1, 0.5*0.95
        learning_rate = 0.45125    # epoch 2, 0.475*0.95

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor from ``epoch`` ; the last learning rate is then multiplied by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiplicativeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
                % type(lr_lambda))

        self.lr_lambda = lr_lambda
        super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch,
                                                  verbose)

    def get_lr(self):
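        # The factor multiplies the *previous* learning rate (last_lr), so the
        # per-epoch factors compound, unlike LambdaDecay which always scales
        # base_lr.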
        if self.last_epoch > 0:
            return self.last_lr * self.lr_lambda(self.last_epoch)
        else:
            return self.base_lr