# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
When training a model, it is often useful to decay the
learning rate during training. This is called
learning_rate_decay. There are many strategies for doing
this, and this module provides some classical methods.
Users can also implement their own learning_rate_decay
strategies based on this module.
"""

import math
import numbers

import paddle
from . import control_flow
from . import nn
from . import ops
from . import tensor
from ..framework import default_main_program, Parameter, unique_name, name_scope
from ..framework import Variable
from ..framework import _non_static_mode
from ..dygraph import learning_rate_scheduler as imperate_lr
from ..data_feeder import check_variable_and_dtype, check_type

__all__ = [
    'exponential_decay',
    'natural_exp_decay',
    'inverse_time_decay',
    'polynomial_decay',
    'piecewise_decay',
    'noam_decay',
    'cosine_decay',
    'linear_lr_warmup',
]


def _decay_step_counter(begin=0):
    # the first global step is zero in learning rate decay
    global_step = nn.autoincreased_step_counter(
        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1
    )
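    # cast the step counter to float32 so it can be used directly in the decay formulas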
    global_step = tensor.cast(global_step, 'float32')
    return global_step


def noam_decay(d_model, warmup_steps, learning_rate=1.0):
    """

    Noam decay method. The numpy implementation of noam decay is as follows.

    .. code-block:: python

      import paddle.fluid as fluid
      import numpy as np
      # set hyper parameters
      base_lr = 0.01
      d_model = 2
      current_steps = 20
      warmup_steps = 200
      # compute
      lr_value = base_lr * np.power(d_model, -0.5) * np.min([
                              np.power(current_steps, -0.5),
                              np.power(warmup_steps, -1.5) * current_steps])

    Please refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:
        d_model(Variable): The dimensionality of input and output of the model.

        warmup_steps(Variable): A hyperparameter specifying the number of warmup steps.

        learning_rate(Variable|float|int): The initial learning rate. If the type
            is Variable, it's a tensor with shape [1], the data type can be
            float32 or float64. It also can be set to python int number. Default 1.0

    Returns:
        The decayed learning rate.
    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          warmup_steps = 100
          learning_rate = 0.01
          lr = fluid.layers.learning_rate_scheduler.noam_decay(
                         1/(warmup_steps *(learning_rate ** 2)),
                         warmup_steps,
                         learning_rate)
    """
    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.NoamDecay(
                d_model, warmup_steps, learning_rate=learning_rate
            )
            return decay
        else:
            global_step = _decay_step_counter(1)

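            # lr = learning_rate * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))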
            a = global_step**-0.5
            b = (warmup_steps**-1.5) * global_step
            lr_value = learning_rate * (d_model**-0.5) * paddle.minimum(a, b)

            return lr_value


def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """

    Applies exponential decay to the learning rate.

    When training a model, it is often recommended to lower the learning rate as the
    training progresses. By using this function, the learning rate will be decayed by
    'decay_rate' every 'decay_steps' steps.

    The decayed learning rate is calculated as follows:

    >>> if staircase == True:
    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
    >>> else:
    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)

    Args:
        learning_rate(Variable|float): The initial learning rate. It should be a Variable
                                       or a float
        decay_steps(int): The learning rate decay steps. See the decay computation above.
        decay_rate(float): The learning rate decay rate. See the decay computation above.
        staircase(bool): If True, decay the learning rate at discrete intervals, which
                         means the learning rate will be decayed by `decay_rate` every
                         `decay_steps` steps. If False, the learning rate will be decayed
                         continuously, following the formula above. Default: False

    Returns:
        Variable: The decayed learning rate. The data type is float32.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import paddle

          paddle.enable_static()
          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
              learning_rate=fluid.layers.exponential_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))

    """
    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.ExponentialDecay(
                learning_rate, decay_steps, decay_rate, staircase
            )
            return decay
        else:
            global_step = _decay_step_counter()

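            # decayed_lr = learning_rate * decay_rate ^ (global_step / decay_steps); the exponent is floored when staircase=True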
            div_res = global_step / decay_steps
            if staircase:
                div_res = ops.floor(div_res)
            decayed_lr = learning_rate * (decay_rate**div_res)

            return decayed_lr


def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """

    Applies natural exponential decay to the initial learning rate.

    When training a model, it is often recommended to lower the learning rate as the
    training progresses. By using this function, the learning rate will be decayed by
    natural exponential power 'decay_rate' every 'decay_steps' steps.

    The decayed learning rate is calculated as follows:

    >>> if not staircase:
    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
    >>> else:
    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))

    Args:
        learning_rate(Variable|float): The initial learning rate. It should be a Variable
                                       or a float
        decay_steps(int): The learning rate decay steps. See the decay computation above.
        decay_rate(float): The learning rate decay rate. See the decay computation above.
        staircase(bool): If True, decay the learning rate at discrete intervals, which
                         means the learning rate will be decayed by natural exponential power
                         `decay_rate` every `decay_steps` steps. If False, the learning rate
                         will be decayed continuously, following the formula above. Default: False

    Returns:
        The decayed learning rate. The data type is float32.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import paddle

          paddle.enable_static()
          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
              learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))

    """
    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.NaturalExpDecay(
                learning_rate, decay_steps, decay_rate, staircase
            )
            return decay
        else:
            global_step = _decay_step_counter()

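            # decayed_lr = learning_rate * exp(-decay_rate * global_step / decay_steps); the quotient is floored when staircase=True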
            div_res = global_step / decay_steps
            if staircase:
                div_res = ops.floor(div_res)
            decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)

            return decayed_lr


def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """

    Applies inverse time decay to the initial learning rate.

    When training a model, it is often recommended to lower the learning rate as the
    training progresses. By using this function, an inverse decay function will be
    applied to the initial learning rate.

    The decayed learning rate is calculated as follows:

    >>> if staircase == True:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
    >>> else:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)

    Args:
        learning_rate(Variable|float): The initial learning rate. It should be a Variable
                                       or a float
        decay_steps(int): The learning rate decay steps. See the decay computation above.
        decay_rate(float): The learning rate decay rate. See the decay computation above.
        staircase(bool): If True, decay the learning rate at discrete intervals, which
                         means the learning rate will be decayed by `decay_rate` times
                         every `decay_steps`. If False, the learning rate will be decayed
                         continuously, following the formula above. Default: False

    Returns:
        Variable: The decayed learning rate. The data type is float32.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import paddle
          paddle.enable_static()
          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
              learning_rate=fluid.layers.inverse_time_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
    """
    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.InverseTimeDecay(
                learning_rate, decay_steps, decay_rate, staircase
            )
            return decay
        else:
            global_step = _decay_step_counter()

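            # decayed_lr = learning_rate / (1 + decay_rate * global_step / decay_steps); the quotient is floored when staircase=True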
            div_res = global_step / decay_steps
            if staircase:
                div_res = ops.floor(div_res)

            decayed_lr = learning_rate / (1 + decay_rate * div_res)

            return decayed_lr


def polynomial_decay(
    learning_rate, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False
):
    """
    Applies polynomial decay to the initial learning rate.

    .. code-block:: text

     if cycle:
       decay_steps = decay_steps * ceil(global_step / decay_steps)
     else:
       global_step = min(global_step, decay_steps)
     decayed_learning_rate = (learning_rate - end_learning_rate) *
          (1 - global_step / decay_steps) ^ power + end_learning_rate

    Args:
        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
          will be the initial learning rate during training.
        decay_steps(int32): A Python `int32` number.
        end_learning_rate(float): A Python `float` number.
        power(float): A Python `float` number.
        cycle(bool): If set to True, decay the learning rate every decay_steps.

    Returns:
        Variable: The decayed learning rate.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          start_lr = 0.01
          total_step = 5000
          end_lr = 0
          lr = fluid.layers.polynomial_decay(
              start_lr, total_step, end_lr, power=1)

    """
    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.PolynomialDecay(
                learning_rate, decay_steps, end_learning_rate, power, cycle
            )
            return decay
        else:
            global_step = _decay_step_counter()

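            # cycle=True: scale decay_steps by ceil(global_step / decay_steps) so the decay restarts each cycle (use 1 at step 0)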
            if cycle:
                div_res = ops.ceil(global_step / decay_steps)
                zero_var = tensor.fill_constant(
                    shape=[1], dtype='float32', value=0.0
                )
                one_var = tensor.fill_constant(
                    shape=[1], dtype='float32', value=1.0
                )

                with control_flow.Switch() as switch:
                    with switch.case(global_step == zero_var):
                        tensor.assign(input=one_var, output=div_res)
                decay_steps = decay_steps * div_res
            else:
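                # cycle=False: clamp global_step at decay_steps so the learning rate settles at end_learning_rate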
                decay_steps_var = tensor.fill_constant(
                    shape=[1], dtype='float32', value=float(decay_steps)
                )
                global_step = paddle.minimum(x=global_step, y=decay_steps_var)

            decayed_lr = (learning_rate - end_learning_rate) * (
                (1 - global_step / decay_steps) ** power
            ) + end_learning_rate
            return decayed_lr


def piecewise_decay(boundaries, values):
    """

    Applies piecewise decay to the initial learning rate.

    The algorithm can be described as the code below.

    .. code-block:: text

      boundaries = [10000, 20000]
      values = [1.0, 0.5, 0.1]
      if step < 10000:
          learning_rate = 1.0
      elif 10000 <= step < 20000:
          learning_rate = 0.5
      else:
          learning_rate = 0.1

    Args:
        boundaries: A list of step numbers.
        values: A list of learning rate values that will be picked during
            different step boundaries.

    Returns:
        The decayed learning rate.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import paddle
          paddle.enable_static()
          boundaries = [10000, 20000]
          values = [1.0, 0.5, 0.1]
          optimizer = fluid.optimizer.Momentum(
              momentum=0.9,
              learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries, values=values),
              regularization=fluid.regularizer.L2Decay(1e-4))


    """
    with default_main_program()._lr_schedule_guard():
        if len(values) - len(boundaries) != 1:
            raise ValueError("len(values) - len(boundaries) should be 1")

        if _non_static_mode():
            decay = imperate_lr.PiecewiseDecay(boundaries, values, 0)
            return decay
        else:
            global_step = _decay_step_counter()

            lr = tensor.create_global_var(
                shape=[1],
                value=0.0,
                dtype='float32',
                persistable=True,
                name="learning_rate",
            )

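            # assign the learning rate of the interval that contains the current step; past the last boundary, use the final value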
            with control_flow.Switch() as switch:
                for i in range(len(boundaries)):
                    boundary_val = tensor.fill_constant(
                        shape=[1],
                        dtype='float32',
                        value=float(boundaries[i]),
                        force_cpu=True,
                    )
                    with switch.case(global_step < boundary_val):
                        tensor.fill_constant(
                            shape=[1],
                            dtype="float32",
                            value=float(values[i]),
                            out=lr,
                        )
                with switch.default():
                    tensor.fill_constant(
                        shape=[1],
                        dtype="float32",
                        value=float(values[len(values) - 1]),
                        out=lr,
                    )

            return lr


def cosine_decay(learning_rate, step_each_epoch, epochs):
    r"""

    Applies cosine decay to the learning rate.

    When training a model, it is often recommended to lower the learning rate as the
    training progresses. By using this function, the learning rate will be decayed
    following the cosine decay strategy.

    .. math::

        decayed\_lr = learning\_rate * 0.5 * (cos(epoch * \frac{\pi}{epochs}) + 1)

    Args:
        learning_rate(Variable|float): The initial learning rate.
        step_each_epoch(int): the number of steps in an epoch.
        epochs(int): the number of epochs.

    Returns:
        Variable: The decayed learning rate.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            base_lr = 0.1
            lr = fluid.layers.cosine_decay(
                learning_rate=base_lr, step_each_epoch=10000, epochs=120)
    """
    check_type(
        learning_rate, 'learning_rate', (float, tensor.Variable), 'cosine_decay'
    )

    with default_main_program()._lr_schedule_guard():
        if _non_static_mode():
            decay = imperate_lr.CosineDecay(
                learning_rate, step_each_epoch, epochs
            )
            return decay
        else:
            global_step = _decay_step_counter()

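            # decayed_lr = learning_rate * 0.5 * (cos(pi * cur_epoch / epochs) + 1), with cur_epoch = floor(global_step / step_each_epoch)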
            cur_epoch = ops.floor(global_step / step_each_epoch)
            decayed_lr = (
                learning_rate
                * 0.5
                * (ops.cos(cur_epoch * math.pi / epochs) + 1)
            )
            return decayed_lr


def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
    """

    This operator uses the linear learning rate warm-up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
    For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_.

    When global_step < warmup_steps, the learning rate is updated as:

    .. code-block:: text

            linear_step = end_lr - start_lr
            lr = start_lr + linear_step * (global_step / warmup_steps)

    where start_lr is the initial learning rate, and end_lr is the final learning rate;

    When global_step >= warmup_steps, the learning rate is updated as:

    .. code-block:: text

            lr = learning_rate

    where lr is the learning_rate after warm-up.

    Args:
        learning_rate (Variable|float): The learning rate after warm-up. It can be a 1-D Tensor or a single value with data type float32.
        warmup_steps (int): Steps for warm up.
        start_lr (float): Initial learning rate of warm up.
        end_lr (float): Final learning rate of warm up.

    Returns:
        Variable: Warm-up learning rate with the same data type as learning_rate.


    Examples:

    .. code-block:: python

        import paddle.fluid as fluid

        boundaries = [100, 200]
        lr_steps = [0.1, 0.01, 0.001]
        learning_rate = fluid.layers.piecewise_decay(boundaries, lr_steps) #case1, 1D-Tensor
        #learning_rate = 0.1  #case2, single-value
        warmup_steps = 50
        start_lr = 1. / 3.
        end_lr = 0.1
        decayed_lr = fluid.layers.linear_lr_warmup(learning_rate,
            warmup_steps, start_lr, end_lr)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        out, = exe.run(fetch_list=[decayed_lr.name])
        print(out)
        # case1: [0.33333334]
        # case2: [0.33333334]
    """
    dtype = 'float32'
    if isinstance(learning_rate, Variable):
        dtype = learning_rate.dtype

    linear_step = float(end_lr) - float(start_lr)
    with default_main_program()._lr_schedule_guard():

        if _non_static_mode():
            lr = imperate_lr.LinearLrWarmup(
                learning_rate, warmup_steps, start_lr, end_lr
            )
            return lr
        else:
            lr = tensor.create_global_var(
                shape=[1],
                value=0.0,
                dtype=dtype,
                persistable=True,
                name="learning_rate_warmup",
            )

            global_step = _decay_step_counter()

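            # during warm-up, interpolate linearly from start_lr to end_lr; afterwards, use learning_rate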
            with control_flow.Switch() as switch:
                with switch.case(global_step < warmup_steps):
                    decayed_lr = start_lr + linear_step * (
                        global_step / float(warmup_steps)
                    )
                    tensor.assign(decayed_lr, lr)
                with switch.default():
                    if not isinstance(learning_rate, Variable):
                        learning_rate = tensor.fill_constant(
                            shape=[1], dtype=dtype, value=float(learning_rate)
                        )
                    tensor.assign(learning_rate, lr)
            return lr