# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy
import warnings
from paddle import Tensor
import paddle.fluid.core as core
from ..fluid.framework import _in_legacy_dygraph

__all__ = [  # noqa
    'LRScheduler',
    'NoamDecay',
    'PiecewiseDecay',
    'NaturalExpDecay',
    'InverseTimeDecay',
    'PolynomialDecay',
    'LinearWarmup',
    'ExponentialDecay',
    'MultiStepDecay',
    'StepDecay',
    'LambdaDecay',
    'ReduceOnPlateau',
    'CosineAnnealingDecay',
    'MultiplicativeDecay',
    'OneCycleLR',
    'CyclicLR',
]


class LRScheduler(object):
    """

    LRScheduler Base class. Define the common interface of a learning rate scheduler.

    User can import it by ``from paddle.optimizer.lr import LRScheduler`` ,

    then overload it for your subclass and have a custom implementation of ``get_lr()`` .

    Otherwise, a ``NotImplementedError`` exception will be thrown.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        instance to schedule learning rate.

    Examples:
        Here is an example of a simple ``StepDecay`` implementation.

        .. code-block:: python

            import paddle
            from paddle.optimizer.lr import LRScheduler

            class StepDecay(LRScheduler):
                def __init__(self,
                            learning_rate,
                            step_size,
                            gamma=0.1,
                            last_epoch=-1,
                            verbose=False):
                    if not isinstance(step_size, int):
                        raise TypeError(
                            "The type of 'step_size' must be 'int', but received %s." %
                            type(step_size))
                    if gamma >= 1.0:
                        raise ValueError('gamma should be < 1.0.')

                    self.step_size = step_size
                    self.gamma = gamma
                    super().__init__(learning_rate, last_epoch, verbose)

                def get_lr(self):
                    i = self.last_epoch // self.step_size
                    return self.base_lr * (self.gamma**i)

    """

    def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of learning rate must be float, but received {}".format(
                    type(learning_rate)
                )
            )
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = last_epoch
        self.verbose = verbose
        self._var_name = None

        self.step()

    def __call__(self):
        """
        Return the latest computed learning rate of the current epoch.
        """
        return self.last_lr

    def step(self, epoch=None):
        """

        ``step`` should be called after ``optimizer.step`` . It will update the learning rate in the optimizer according to the current ``epoch`` .
        The new learning rate will take effect on the next ``optimizer.step`` .

        Args:
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        """
        if epoch is None:
            self.last_epoch += 1
            self.last_lr = self.get_lr()
        else:
            self.last_epoch = epoch
            if hasattr(self, "_get_closed_form_lr"):
                self.last_lr = self._get_closed_form_lr()
            else:
                self.last_lr = self.get_lr()

        if self.verbose:
            print(
                'Epoch {}: {} set learning rate to {}.'.format(
                    self.last_epoch, self.__class__.__name__, self.last_lr
                )
            )

    def state_dict(self):
        """

        Returns the state of the scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        """
        self.state_keys()
        state_dict = {}
        for key in self.keys:
            if key not in self.__dict__:
                continue
            value = self.__dict__[key]
            if isinstance(value, Tensor):
                assert value.shape == [
                    1
                ], "shape of Tensor in state_dict must be [1] {}".format(
                    value.shape
                )
                value = value.numpy()[0]
            state_dict[key] = value

        return state_dict

    # For subclasses that overload LRScheduler, "last_epoch, last_lr" will be saved by default.
    # (Note): you can change it for your subclass.
    def state_keys(self):
        """

        For subclasses that overload ``LRScheduler`` (Base Class), "last_epoch, last_lr" will be saved by default via ``self.keys = ['last_epoch', 'last_lr']`` .

        ``last_epoch`` is the current epoch num, and ``last_lr`` is the current learning rate.

        If you want to change the default behavior, you should have a custom implementation of ``state_keys()`` to redefine ``self.keys`` .

        """
        self.keys = ['last_epoch', 'last_lr']

    def set_state_dict(self, state_dict):
        """

        Loads the scheduler's state.
        """
        self.state_keys()
        for key in self.keys:
            if key in state_dict:
                self.__dict__[key] = state_dict[key]
            else:
                raise RuntimeError(
                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".format(
                        key
                    )
                )
        if len(state_dict) > len(self.keys):
            warnings.warn(
                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
            )

    # alias for set_state_dict
    set_dict = set_state_dict

    def get_lr(self):
        """

        For subclasses that overload ``LRScheduler`` (Base Class), user should have a custom implementation of ``get_lr()`` .

        Otherwise, a ``NotImplementedError`` exception will be thrown.
        """
        # calculate by python float
        raise NotImplementedError
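
# A minimal usage sketch of ``state_dict`` / ``set_state_dict`` for checkpointing a
# scheduler (illustrative values only; any scheduler subclass defined below works the
# same way):
#
#     scheduler = StepDecay(learning_rate=0.5, step_size=5)
#     for _ in range(7):
#         scheduler.step()                      # advance the epoch counter while training
#     ckpt = scheduler.state_dict()             # {'last_epoch': 7, 'last_lr': 0.05}
#
#     resumed = StepDecay(learning_rate=0.5, step_size=5)
#     resumed.set_state_dict(ckpt)              # restores last_epoch and last_lr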


class NoamDecay(LRScheduler):
    r"""

    Applies Noam Decay to the initial learning rate.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5})

    Please refer to `attention is all you need <https://arxiv.org/pdf/1706.03762.pdf>`_


    Args:
        d_model (int): The dimensionality of input and output feature vectors of the model. It is a python int number.
        warmup_steps (int): The number of warmup steps, a hyperparameter. It is a python int number.
        learning_rate (float): The initial learning rate. It is a python float number. Default: 1.0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NoamDecay`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(
        self,
        d_model,
        warmup_steps,
        learning_rate=1.0,
        last_epoch=-1,
        verbose=False,
    ):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        if self.last_epoch == 0:
            a = 1
        else:
            a = self.last_epoch**-0.5
        b = self.warmup_steps**-1.5 * self.last_epoch
        return self.base_lr * (self.d_model**-0.5) * min(a, b)
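
# Worked check of the formula above (illustrative numbers, not from the original
# docstring): with base_lr=1.0, d_model=512 and warmup_steps=4000, the schedule rises
# linearly while epoch < warmup_steps and peaks at epoch == warmup_steps, where
# epoch**-0.5 == warmup_steps**-1.5 * epoch, giving
#     lr = 1.0 * 512**-0.5 * 4000**-0.5 ≈ 6.99e-4
# after which it decays proportionally to epoch**-0.5.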


class PiecewiseDecay(LRScheduler):
    """

    Piecewise learning rate scheduler.

    The algorithm can be described as the code below:

    .. code-block:: text

        boundaries = [100, 200]
        values = [1.0, 0.5, 0.1]
        if epoch < 100:
            learning_rate = 1.0
        elif 100 <= epoch < 200:
            learning_rate = 0.5
        else:
            learning_rate = 0.1

    Args:
        boundaries(list|tuple): A list/tuple of step numbers. The type of element in the list is python int.
        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries.
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PiecewiseDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
        self.boundaries = boundaries
        self.values = values
        super().__init__(last_epoch=last_epoch, verbose=verbose)

    def get_lr(self):
        for i in range(len(self.boundaries)):
            if self.last_epoch < self.boundaries[i]:
                return self.values[i]
        return self.values[len(self.values) - 1]


class NaturalExpDecay(LRScheduler):
    r"""

    Applies natural exponential decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = learning\_rate * e^{- gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): A ratio to update the learning rate; it should be greater than 0.0 to make the learning rate decay. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``NaturalExpDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert (
            gamma > 0.0
        ), " 'gamma' must be a positive number so that the learning rate will decay."
        self.gamma = gamma
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)


class InverseTimeDecay(LRScheduler):
    r"""

    Applies inverse time decay to the initial learning rate.

    The algorithm can be described as follows:

    .. math::

        new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch}

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float, optional): The decay rate used in the formula above: ``new_lr = learning_rate / (1 + gamma * epoch)`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``InverseTimeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        self.gamma = gamma
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr / (1 + self.gamma * self.last_epoch)


class PolynomialDecay(LRScheduler):
    r"""

    Applies polynomial decay to the initial learning rate.

    The algorithm can be described as follows.

    If cycle is set to True, then:

    .. math::

        decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps})

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr

    If cycle is set to False, then:

    .. math::

        epoch & = min(epoch, decay\_steps)

        new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr


    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of polynomial; it should be greater than 0.0 to get learning rate decay. Default: 1.0.
        cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decreases
            to ``end_lr`` .  If False, the learning rate is monotone decreasing. Default: False.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``PolynomialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(
        self,
        learning_rate,
        decay_steps,
        end_lr=0.0001,
        power=1.0,
        cycle=False,
        last_epoch=-1,
        verbose=False,
    ):
        assert decay_steps > 0 and isinstance(
            decay_steps, int
        ), " 'decay_steps' must be a positive integer."
        self.decay_steps = decay_steps
        self.end_lr = end_lr
        assert (
            power > 0.0
        ), " 'power' must be greater than 0.0 so that the learning rate will decay."
        self.power = power
        self.cycle = cycle
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        tmp_epoch_num = self.last_epoch
        tmp_decay_steps = self.decay_steps
        if self.cycle:
            div_res = math.ceil(
                float(self.last_epoch) / float(self.decay_steps)
            )

            if self.last_epoch == 0:
                div_res = 1
            tmp_decay_steps = self.decay_steps * div_res
        else:
            tmp_epoch_num = min(self.last_epoch, self.decay_steps)

        return (self.base_lr - self.end_lr) * (
            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)) ** self.power
        ) + self.end_lr
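
# Worked check of the ``cycle=True`` branch above (illustrative numbers): with
# learning_rate=0.5, decay_steps=20, end_lr=0.0, power=1.0 and epoch 25, the decay
# window is stretched to ceil(25 / 20) * 20 = 40 steps, so
#     lr = (0.5 - 0.0) * (1 - 25 / 40) ** 1.0 + 0.0 = 0.1875
# instead of clamping the epoch at 20 as the ``cycle=False`` branch would.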


class LinearWarmup(LRScheduler):
    r"""

    Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_

    When epoch < warmup_steps, learning rate is updated as:

    .. math::

            lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps}

    where start_lr is the initial learning rate, and end_lr is the final learning rate.

    When epoch >= warmup_steps, learning rate is updated as:

    .. math::

            lr = learning_rate

    where ``learning_rate`` is a float or any subclass of ``LRScheduler`` .

    Args:
        learning_rate (float|LRScheduler): The learning rate after warm-up. It is a python float number or any subclass of ``LRScheduler`` .
        warmup_steps (int): Total steps of warm up. It must be a positive integer.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LinearWarmup`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LinearWarmup(
                    learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(
        self,
        learning_rate,
        warmup_steps,
        start_lr,
        end_lr,
        last_epoch=-1,
        verbose=False,
    ):
        type_check = (
            isinstance(learning_rate, float)
            or isinstance(learning_rate, int)
            or isinstance(learning_rate, LRScheduler)
        )
        if not type_check:
            raise TypeError(
                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".format(
                    learning_rate
                )
            )
        self.learning_rate = learning_rate
        assert warmup_steps > 0 and isinstance(
            warmup_steps, int
        ), " 'warmup_steps' must be a positive integer."
        self.warmup_steps = warmup_steps
        self.start_lr = start_lr
        self.end_lr = end_lr
        assert (
            end_lr > start_lr
        ), "end_lr {} must be greater than start_lr {}".format(end_lr, start_lr)
        super().__init__(start_lr, last_epoch, verbose)

    def state_dict(self):
        """
        Returns the state of the LinearWarmup scheduler as a :class:`dict`.

        It is a subset of ``self.__dict__`` .
        """
        state_dict = super().state_dict()
        if isinstance(self.learning_rate, LRScheduler):
            state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict()
        return state_dict

    def set_state_dict(self, state_dict):
        """
        Loads state_dict for LinearWarmup scheduler.
        """
        super().set_state_dict(state_dict)
        if isinstance(self.learning_rate, LRScheduler):
            self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"])

    def get_lr(self):
        if self.last_epoch < self.warmup_steps:
            return (self.end_lr - self.start_lr) * float(
                self.last_epoch
            ) / float(self.warmup_steps) + self.start_lr
        else:
            if isinstance(self.learning_rate, LRScheduler):
                self.learning_rate.step(self.last_epoch - self.warmup_steps)
                return self.learning_rate()

            return self.learning_rate
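
# A hedged usage sketch: ``learning_rate`` may also be another ``LRScheduler`` instead of
# a float (the concrete boundaries/values below are illustrative assumptions):
#
#     decay = paddle.optimizer.lr.PiecewiseDecay(boundaries=[100, 200], values=[0.5, 0.1, 0.01])
#     scheduler = paddle.optimizer.lr.LinearWarmup(
#         learning_rate=decay, warmup_steps=20, start_lr=0.0, end_lr=0.5)
#
# For the first 20 steps the learning rate ramps linearly from 0.0 to 0.5; afterwards
# ``decay`` takes over and is stepped with ``epoch - warmup_steps``, as in ``get_lr`` above.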


class ExponentialDecay(LRScheduler):
    r"""

    Update learning rate by `gamma` each epoch.

    The algorithm can be described as follows.

    .. math::

        new\_learning\_rate = last\_learning\_rate * gamma

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        gamma (float): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be in interval (0.0, 1.0).
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``ExponentialDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
        assert (
            gamma > 0.0 and gamma < 1.0
        ), " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
        self.gamma = gamma
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr * (self.gamma**self.last_epoch)


class MultiStepDecay(LRScheduler):
    """
    Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        milestones = [30, 50]
        gamma = 0.1
        if epoch < 30:
            learning_rate = 0.5
        elif epoch < 50:
            learning_rate = 0.05
        else:
            learning_rate = 0.005

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        milestones (tuple|list): List or tuple of milestone boundaries. Must be increasing.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .


    Returns:
        ``MultiStepDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(
        self, learning_rate, milestones, gamma=0.1, last_epoch=-1, verbose=False
    ):
        if not isinstance(milestones, (tuple, list)):
            raise TypeError(
                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
                % type(milestones)
            )

        if not all(
            [
                milestones[i] < milestones[i + 1]
                for i in range(len(milestones) - 1)
            ]
        ):
            raise ValueError('The elements of milestones must be incremented')
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        self.milestones = milestones
        self.gamma = gamma
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        for i in range(len(self.milestones)):
            if self.last_epoch < self.milestones[i]:
                return self.base_lr * (self.gamma**i)
        return self.base_lr * (self.gamma ** len(self.milestones))


class StepDecay(LRScheduler):
    """
    Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epochs.

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5
        step_size = 30
        gamma = 0.1

        learning_rate = 0.5     if epoch < 30
        learning_rate = 0.05    if 30 <= epoch < 60
        learning_rate = 0.005   if 60 <= epoch < 90
        ...

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): The interval to update. It must be a positive integer.
        gamma (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
            It should be less than 1.0. Default: 0.1.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``StepDecay`` instance to schedule learning rate.


    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(
        self, learning_rate, step_size, gamma=0.1, last_epoch=-1, verbose=False
    ):
        if not isinstance(step_size, int):
            raise TypeError(
                "The type of 'step_size' must be 'int', but received %s."
                % type(step_size)
            )
        if gamma >= 1.0:
            raise ValueError('gamma should be < 1.0.')

        assert step_size > 0 and isinstance(
            step_size, int
        ), " 'step_size' must be a positive integer."
        self.step_size = step_size
        self.gamma = gamma
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        i = self.last_epoch // self.step_size
        return self.base_lr * (self.gamma**i)


class LambdaDecay(LRScheduler):
    """
    Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is a function which receives ``epoch`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95 ** epoch

        learning_rate = 0.5        # epoch 0, 0.5*0.95**0
        learning_rate = 0.475      # epoch 1, 0.5*0.95**1
        learning_rate = 0.45125    # epoch 2, 0.5*0.95**2

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``LambdaDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
                % type(lr_lambda)
            )

        self.lr_lambda = lr_lambda
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr * self.lr_lambda(self.last_epoch)


class ReduceOnPlateau(LRScheduler):
    """
    Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate
    by 2 to 10 times once model performance stops improving.

    The ``metrics`` is the one which has been passed into ``step`` , it must be a 1-D Tensor with shape [1]. When ``metrics``
    stops descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` .
    (Specially, ``mode`` can also be set to ``'max'`` , in this case, when ``metrics`` stops ascending for a ``patience``
    number of epochs, the learning rate will be reduced.)

    In addition, after each reduction, it will wait a ``cooldown`` number of epochs before resuming the above operation.

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the
            learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` ,  the learning
            rate will reduce when ``loss`` stops ascending. Default: ``'min'`` .
        factor (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * factor`` .
            It should be less than 1.0. Default: 0.1.
        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, the learning rate will be reduced.
            Default: 10.
        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` .
            Thus tiny changes of ``loss`` will be ignored. Default: 1e-4.
        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
            is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in the last epoch. In ``'abs'`` mode, the minimum
            change of ``loss`` is ``threshold`` . Default: ``'rel'`` .
        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0.
        min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0.
        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
            the update is ignored. Default: 1e-8.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``.

G
guguguzi 已提交
1254

1255
    Returns:
1256
        ``ReduceOnPlateau`` instance to schedule learning rate.
1257 1258 1259 1260 1261 1262 1263 1264


    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step(loss)    # If you update learning rate each step
              # scheduler.step(loss)        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step(out[0])    # If you update learning rate each step
              # scheduler.step(out[0])        # If you update learning rate each epoch

    """

    def __init__(
        self,
        learning_rate,
        mode='min',
        factor=0.1,
        patience=10,
        threshold=1e-4,
        threshold_mode='rel',
        cooldown=0,
        min_lr=0,
        epsilon=1e-8,
        verbose=False,
    ):
        mode = mode.lower()
        if mode not in ['min', 'max']:
            raise ValueError('mode: ' + mode + ' is unknown!')
        self.mode = mode

        if factor >= 1.0:
            raise ValueError(
                'new_lr = origin_lr * factor and factor should be < 1.0.'
            )
        self.factor = factor

        threshold_mode = threshold_mode.lower()
        if threshold_mode not in ['rel', 'abs']:
            raise ValueError(
                'threshold mode: ' + threshold_mode + ' is unknown!'
            )
        self.threshold_mode = threshold_mode
        if not isinstance(learning_rate, (float, int)):
            raise TypeError(
                "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
                % type(learning_rate)
            )

        self.patience = patience
        self.threshold = threshold
        self.threshold_mode = threshold_mode
        self.cooldown = cooldown
        self.min_lr = min_lr
        self.epsilon = epsilon

        self.cooldown_counter = 0
        self.best = None
        self.num_bad_epochs = 0

        # Can not call Parent __init__, so implement here.
        self.base_lr = float(learning_rate)
        self.last_lr = float(learning_rate)
        self.last_epoch = 0
        self.verbose = verbose
        self._var_name = None

    # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
    def state_keys(self):
        self.keys = [
            'cooldown_counter',
            'best',
            'num_bad_epochs',
            'last_epoch',
            'last_lr',
        ]

    def step(self, metrics, epoch=None):
        """
        step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` .
        The new learning rate will take effect on the next epoch.

        Args:
            metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce.
                If it stops descending for a ``patience`` number of epochs, the learning rate will be reduced. If it's 'Tensor' or
                'numpy.ndarray', its shape must be [1].
            epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.

        Returns:
            None

        Examples:
            Please refer to the example of current LRScheduler.
        """
        if epoch is None:
            self.last_epoch = self.last_epoch + 1
        else:
            self.last_epoch = epoch

        if not _in_legacy_dygraph():
            tmp = core.eager.Tensor
        else:
            # need to declare explicitly
            from paddle.framework import VarBase as Tensor

            tmp = Tensor
        # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
        if isinstance(metrics, (tmp, numpy.ndarray)):
            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, (
                "the metrics.shape "
                "should be (1L,), but the current metrics.shape is {}. Maybe that "
                "you should call paddle.mean to process it first.".format(
                    metrics.shape
                )
            )
        elif not isinstance(
            metrics, (int, float, numpy.float32, numpy.float64)
        ):
            raise TypeError(
                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but received {}".format(
                    type(metrics)
                )
            )

        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
        else:
            if self.best is None or self._is_better(metrics, self.best):
                self.best = metrics
                self.num_bad_epochs = 0
            else:
                self.num_bad_epochs += 1

            if self.num_bad_epochs > self.patience:
                self.cooldown_counter = self.cooldown
                self.num_bad_epochs = 0
                new_lr = max(self.last_lr * self.factor, self.min_lr)
                if self.last_lr - new_lr > self.epsilon:
                    self.last_lr = new_lr
                    if self.verbose:
                        print(
                            'Epoch {}: {} set learning rate to {}.'.format(
                                self.last_epoch,
                                self.__class__.__name__,
                                self.last_lr,
                            )
                        )

    def _is_better(self, current, best):
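        # 'min'/'max' sets the direction of improvement; 'rel' measures the required
        # improvement relative to the best value seen so far, 'abs' as a fixed margin.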
        if self.mode == 'min' and self.threshold_mode == 'rel':
            return current < best - best * self.threshold

        elif self.mode == 'min' and self.threshold_mode == 'abs':
            return current < best - self.threshold

        elif self.mode == 'max' and self.threshold_mode == 'rel':
            return current > best + best * self.threshold

        else:
            return current > best + self.threshold


class CosineAnnealingDecay(LRScheduler):
    r"""

    Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
    the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
    SGDR.

    The algorithm can be described as follows.

    .. math::

        \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
        & T_{cur} \neq (2k+1)T_{max};

        \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
        \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
        & T_{cur} = (2k+1)T_{max}.

    It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts <https://arxiv.org/abs/1608.03983>`_.
    Note that this only implements the cosine annealing part of SGDR, and not the restarts.
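
    For intuition, the closed form traces half a cosine period over ``T_max`` epochs. The values below are illustrative,
    assuming ``learning_rate=0.5`` , ``T_max=10`` and ``eta_min=0`` :

    .. code-block:: text

        epoch 0  -> 0.500    # 0.5 * (1 + cos(0)) / 2
        epoch 5  -> 0.250    # 0.5 * (1 + cos(pi / 2)) / 2
        epoch 10 -> 0.000    # 0.5 * (1 + cos(pi)) / 2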

    Args:
        learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number.
        T_max (int): Maximum number of iterations. It is half of the decay cycle of the learning rate. It must be a positive integer.
        eta_min (float|int, optional): Minimum learning rate, that is :math:`\eta_{min}` . Default: 0.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CosineAnnealingDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(20):
                for batch_id in range(5):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch
    """

    def __init__(
        self, learning_rate, T_max, eta_min=0, last_epoch=-1, verbose=False
    ):
        if not isinstance(T_max, int):
            raise TypeError(
                "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
                % type(T_max)
            )
        if not isinstance(eta_min, (float, int)):
            raise TypeError(
                "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
                % type(eta_min)
            )
        assert T_max > 0 and isinstance(
            T_max, int
        ), " 'T_max' must be a positive integer."
        self.T_max = T_max
        self.eta_min = float(eta_min)
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
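        # Recursive form of the schedule: epoch 0 returns the initial rate, the
        # second branch handles the epoch right after a cosine period boundary,
        # and the final expression scales the previous rate by the ratio of two
        # consecutive cosine factors (consistent with `_get_closed_form_lr` below).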
        if self.last_epoch == 0:
            return self.base_lr
        elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
            return (
                self.last_lr
                + (self.base_lr - self.eta_min)
                * (1 - math.cos(math.pi / self.T_max))
                / 2
            )

        return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)
        ) * (self.last_lr - self.eta_min) + self.eta_min

    def _get_closed_form_lr(self):
        return (
            self.eta_min
            + (self.base_lr - self.eta_min)
            * (1 + math.cos(math.pi * self.last_epoch / self.T_max))
            / 2
        )


class MultiplicativeDecay(LRScheduler):
    """
    Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` .

    The algorithm can be described as the code below.

    .. code-block:: text

        learning_rate = 0.5        # init learning_rate
        lr_lambda = lambda epoch: 0.95

        learning_rate = 0.5        # epoch 0,
        learning_rate = 0.475      # epoch 1, 0.5*0.95
        learning_rate = 0.45125    # epoch 2, 0.475*0.95

    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiplies the last learning rate by this factor.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``MultiplicativeDecay`` instance to schedule learning rate.

    Examples:

        .. code-block:: python

            import paddle

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(20):
                for batch_id in range(5):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()    # If you update learning rate each step
              # scheduler.step()        # If you update learning rate each epoch

    """

    def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False):
        if not callable(lr_lambda):
            raise TypeError(
                "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
                % type(lr_lambda)
            )

        self.lr_lambda = lr_lambda
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
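        # The factor is cumulative: base_lr is multiplied by lr_lambda(epoch) for
        # every epoch from 1 up to and including last_epoch.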
        cur_lr = self.base_lr
        for epoch in range(1, self.last_epoch + 1):
            cur_lr = cur_lr * self.lr_lambda(epoch)
        return cur_lr


class OneCycleLR(LRScheduler):
    r"""
    Sets the learning rate according to the one cycle learning rate scheduler.
    The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
    from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.

    It has been proposed in `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates <https://arxiv.org/abs/1708.07120>`_.

    Please note that the default behaviour of this scheduler follows the fastai implementation of one cycle,
    which claims that “unpublished work has shown even better results by using only two phases”.
    If you want the behaviour of this scheduler to be consistent with the paper, please set ``three_phase=True`` .

    Also note that you should update learning rate each step.

    Args:
        max_learning_rate (float): The maximum learning rate. It is a python float number.
             Functionally, it defines the initial learning rate by ``divide_factor`` .
        total_steps (int): Number of total training steps.
        divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
        end_learning_rate (float, optional): The minimum learning rate during training; it should be much less than the initial learning rate. Default: 0.0001.
        phase_pct (float): The percentage of total steps used to increase the learning rate. Default: 0.3.
        anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing,
            'linear' for linear annealing. Default: 'cos'.
        three_phase (bool, optional): Whether to use three phases. Default: False.
            If ``True``:
                1. The learning rate will first increase from the initial learning rate to the maximum learning rate.
                2. Then it will decrease back to the initial learning rate. The number of steps in this phase is the same as that in the first phase.
                3. Finally, it will decrease to the minimum learning rate, which is much less than the initial learning rate.
            If ``False``:
                1. The learning rate will increase to the maximum learning rate.
                2. Then it will directly decrease to the minimum learning rate.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``OneCycleLR`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(5):
                for batch_id in range(20):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()        # You should update learning rate each step

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(5):
                for batch_id in range(20):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # You should update learning rate each step
    """

    def __init__(
        self,
        max_learning_rate,
        total_steps,
        divide_factor=25.0,
        end_learning_rate=0.0001,
        phase_pct=0.3,
        anneal_strategy='cos',
        three_phase=False,
        last_epoch=-1,
        verbose=False,
    ):
        # Check type and value of max_learning_rate
        if not isinstance(max_learning_rate, (float, int)):
            raise TypeError(
                "'max_learning_rate' must be 'float' or 'int', but received {}".format(
                    type(max_learning_rate)
                )
            )
        if max_learning_rate < 0:
            raise ValueError("'max_learning_rate' must be a positive integer.")

        # Check type and value of end_learning_rate
        if not isinstance(end_learning_rate, (float, int)):
            raise TypeError(
                "'end_learning_rate' must be 'float' or 'int', but received {}".format(
                    type(end_learning_rate)
                )
            )
        if end_learning_rate < 0:
            raise ValueError("'end_learning_rate' must be a positive integer.")

        # Check type and value of total_steps
        if not isinstance(total_steps, int):
            raise TypeError(
                "'total_step' must be 'int', but received {}".format(
                    type(total_steps)
                )
            )
        if total_steps <= 0:
            raise ValueError("'total_step' must be a positive integer.")
        self.total_steps = total_steps

        # Check type and value of phase_pct
        if not isinstance(phase_pct, float):
            raise TypeError(
                "'phase_pct' must be 'float', but received {}".format(
                    type(phase_pct)
                )
            )
        if phase_pct < 0 or phase_pct > 1:
            raise ValueError(
                "'phase_pct' must be between 0 and 1, but received {}".format(
                    phase_pct
                )
            )

        # Check type and value of divide_factor
        if not isinstance(divide_factor, (float, int)):
            raise TypeError(
                "'divide_factor' must be 'float' or 'int', but received {}".format(
                    type(divide_factor)
                )
            )

        initial_lr = max_learning_rate / float(divide_factor)
        min_lr = float(end_learning_rate)

        if three_phase:
            if phase_pct >= 0.5:
                raise ValueError(
                    "When three_phase is True, 'phase_pct' must be less than 0.5"
                )
            # start step and end step of each phase.
            self._step_config = [
                0,
                phase_pct * self.total_steps - 1,
                2 * phase_pct * self.total_steps - 2,
                self.total_steps - 1,
                self.total_steps - 1,  # for the last step.
            ]
            # step size of each phase.
            self._steps_size = [
                self._step_config[1] - self._step_config[0],
                self._step_config[2] - self._step_config[1],
                self._step_config[3] - self._step_config[2],
                self._step_config[3]
                - self._step_config[2],  # for the last step.
            ]
            # start lr and end lr of each phase.
            self._lr_config = [
                initial_lr,
                max_learning_rate,
                initial_lr,
                min_lr,
            ]
        else:
            self._step_config = [
                0,
                phase_pct * self.total_steps - 1,
                self.total_steps - 1,
                self.total_steps - 1,
            ]
            self._steps_size = [
                self._step_config[1] - self._step_config[0],
                self._step_config[2] - self._step_config[1],
                self._step_config[2] - self._step_config[1],
            ]
            self._lr_config = [initial_lr, max_learning_rate, min_lr]
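            # For example (illustrative): with total_steps=100, phase_pct=0.3 and
            # three_phase=False, the boundaries above are roughly [0, 29, 99, 99]
            # and the phase sizes [29, 70, 70]: about 30 steps ramping up to
            # max_learning_rate, then 70 steps annealing down to end_learning_rate.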

        # Check anneal_strategy
        if anneal_strategy == 'cos':
            self.anneal_func = self._cos_annealing
        elif anneal_strategy == 'linear':
            self.anneal_func = self._linear_annealing
        else:
            raise ValueError(
                "'anneal_strategy' must be one of 'cos' or 'linear', but received {}".format(
                    anneal_strategy
                )
            )
        super().__init__(initial_lr, last_epoch, verbose)

    def _cos_annealing(self, start_lr, end_lr, pct):
        cos_out = math.cos(math.pi * pct) + 1
        return end_lr + (start_lr - end_lr) / 2.0 * cos_out

    def _linear_annealing(self, start_lr, end_lr, pct):
        return (end_lr - start_lr) * pct + start_lr

    def get_lr(self):
        current_step = self.last_epoch

        if current_step > self.total_steps:
            raise ValueError(
                "Tried to step {} times. However the number of total steps is {}".format(
                    current_step, self.total_steps
                )
            )

        for (i, (end_step, step_size)) in enumerate(
            zip(self._step_config[1:], self._steps_size)
        ):
            # i == len(self._lr_config) - 2 catches the last step, otherwise it will return None.
            if current_step <= end_step or i == len(self._lr_config) - 2:
                # self._step_config[i] means start step of a phase.
                percentage = (current_step - self._step_config[i]) / step_size
                return self.anneal_func(
                    self._lr_config[i], self._lr_config[i + 1], percentage
                )


class CyclicLR(LRScheduler):
    r"""
    Set the learning rate according to the cyclic learning rate (CLR) scheduler.
    The scheduler regards the process of learning rate adjustment as one cycle after another.
    It cycles the learning rate between two boundaries with a constant frequency.
    The distance between the two boundaries can be scaled on a per-iteration or per-cycle basis.

    It has been proposed in `Cyclic Learning Rates for Training Neural Networks <https://arxiv.org/abs/1506.01186>`_.

    According to the paper, the cyclic learning rate schedule has three built-in scale methods (see the sketch below):

    * "triangular": A basic triangular cycle without any amplitude scaling.
    * "triangular2": A basic triangular cycle that reduces the initial amplitude by half each cycle.
    * "exp_range": A cycle that scales the initial amplitude by a scale function which is defined as :math:`gamma^{iterations}` .

    The initial amplitude is defined as max_learning_rate - base_learning_rate.
    Also note that you should update learning rate each step.
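
    The three scale functions can be sketched as below (they mirror the scale helpers defined in this class):

    .. code-block:: text

        triangular:   scale_fn(x) = 1.0                  evaluated per cycle
        triangular2:  scale_fn(x) = 1 / (2 ** (x - 1))   evaluated per cycle
        exp_range:    scale_fn(x) = exp_gamma ** x       evaluated per iteration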

    Args:
        base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends
            setting the base_learning_rate to 1/3 or 1/4 of max_learning_rate.
        max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above.
            Since there is some scaling operation during the process of learning rate adjustment,
            max_learning_rate may not actually be reached.
        step_size_up (int): Number of training steps, which is used to increase the learning rate in a cycle.
            The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, the step
            size should be set to at least 3 or 4 times the number of steps in one epoch.
        step_size_down (int, optional): Number of training steps, which is used to decrease the learning rate in a cycle.
            If not specified, its value will be initialized to ``step_size_up`` . Default: None
        mode (str, optional): One of 'triangular', 'triangular2' or 'exp_range'.
            If scale_fn is specified, this argument will be ignored. Default: 'triangular'
        exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0
        scale_fn (function, optional): A custom scaling function, which is used to replace the three built-in methods.
            It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1.
            If specified, then 'mode' will be ignored. Default: None
        scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on the cycle
            number or on cycle iterations (total iterations since the start of training). Default: 'cycle'
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

    Returns:
        ``CyclicLR`` instance to schedule learning rate.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            # train on default dynamic graph mode
            linear = paddle.nn.Linear(10, 10)
            scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
            sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
            for epoch in range(5):
                for batch_id in range(20):
                    x = paddle.uniform([10, 10])
                    out = linear(x)
                    loss = paddle.mean(out)
                    loss.backward()
                    sgd.step()
                    sgd.clear_gradients()
                    scheduler.step()        # You should update learning rate each step

            # train on static graph mode
            paddle.enable_static()
            main_prog = paddle.static.Program()
            start_prog = paddle.static.Program()
            with paddle.static.program_guard(main_prog, start_prog):
                x = paddle.static.data(name='x', shape=[None, 4, 5])
                y = paddle.static.data(name='y', shape=[None, 4, 5])
                z = paddle.static.nn.fc(x, 100)
                loss = paddle.mean(z)
                scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5,
                    max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True)
                sgd = paddle.optimizer.SGD(learning_rate=scheduler)
                sgd.minimize(loss)

            exe = paddle.static.Executor()
            exe.run(start_prog)
            for epoch in range(5):
                for batch_id in range(20):
                    out = exe.run(
                        main_prog,
                        feed={
                            'x': np.random.randn(3, 4, 5).astype('float32'),
                            'y': np.random.randn(3, 4, 5).astype('float32')
                        },
                        fetch_list=loss.name)
                    scheduler.step()    # You should update learning rate each step
    """

    def __init__(
        self,
        base_learning_rate,
        max_learning_rate,
        step_size_up,
        step_size_down=None,
        mode='triangular',
        exp_gamma=1.0,
        scale_fn=None,
        scale_mode='cycle',
        last_epoch=-1,
        verbose=False,
    ):
        # check type and value of max_learning_rate
        if not isinstance(max_learning_rate, (float, int)):
            raise TypeError(
                "'max_learning_rate' must be 'float' or 'int', but received {}".format(
                    type(max_learning_rate)
                )
            )
        if max_learning_rate < 0:
            raise ValueError(
                "'max_learning_rate' must be a positive integer, but received {}".format(
                    max_learning_rate
                )
            )

        # check type and value of step_size_up
        if not isinstance(step_size_up, int):
            raise TypeError(
                "The type of 'step_size_up' must be int, but received {}".format(
                    type(step_size_up)
                )
            )
        if step_size_up <= 0:
            raise ValueError(
                "'step_size_up' must be a positive integer, but received {}".format(
                    step_size_up
                )
            )

        # check type and value of step_size_down
        if step_size_down is not None:
            if not isinstance(step_size_down, int):
                raise TypeError(
                    "The type of 'step_size_down' must be int, but received {}".format(
                        type(step_size_down)
                    )
                )
            if step_size_down <= 0:
                raise ValueError(
                    "'step_size_down' must be a positive integer, but received {}".format(
                        step_size_down
                    )
                )

        # check type of exp_gamma
        if not isinstance(exp_gamma, float):
            raise TypeError(
                "The type of 'exp_gamma' must be float, but received {}".format(
                    type(exp_gamma)
                )
            )

        step_size_up = float(step_size_up)
        step_size_down = (
            float(step_size_down)
            if step_size_down is not None
            else step_size_up
        )

        self.cycle_size = step_size_up + step_size_down
        self.step_up_pct = step_size_up / self.cycle_size
        self.max_lr = float(max_learning_rate)
        self.amplitude = self.max_lr - base_learning_rate

        if (
            mode not in ['triangular', 'triangular2', 'exp_range']
            and scale_fn is None
        ):
            raise ValueError(
                "'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid"
            )
        if scale_mode not in ['cycle', 'iterations']:
            raise ValueError(
                "'scale_mode' must be one of 'cycle' or 'iterations'"
            )

        self.mode = mode
        self.gamma = exp_gamma  # only for exp_range mode

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        super().__init__(base_learning_rate, last_epoch, verbose)

    def _triangular_scale_fn(self, x):
        return 1.0

    def _triangular2_scale_fn(self, x):
        return 1 / (2.0 ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**x

    def get_lr(self):
        iterations = self.last_epoch

        cycle = 1 + iterations // self.cycle_size
        pct_per_cycle = 1.0 + iterations / self.cycle_size - cycle

        if pct_per_cycle <= self.step_up_pct:
            scale_factor = pct_per_cycle / self.step_up_pct
        else:
            scale_factor = (1 - pct_per_cycle) / (1 - self.step_up_pct)

        base_height = self.amplitude * scale_factor

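        # `scale_mode` is either 'cycle' or 'iterations'; eval() selects the matching
        # local variable computed above as the argument to scale_fn.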
        lr = self.base_lr + base_height * self.scale_fn(eval(self.scale_mode))

        return lr