# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
When training a model, it's often useful to decay the
learning rate during training process, this is called
learning_rate_decay. There are many strategies to do
this, this module will provide some classical method.
User can also implement their own learning_rate_decay
strategy according to this module.
"""

import control_flow
import nn
import ops
import tensor
from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter

__all__ = [
    'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
]


def _decay_step_counter(begin=0):
    # the first global step is zero in learning rate decay
    global_step = nn.autoincreased_step_counter(
        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
    global_step = tensor.cast(global_step, 'float32')
    return global_step


def noam_decay(d_model, warmup_steps):
    """
    Noam decay method. A numpy implementation of Noam decay is as follows.

    >>> import numpy as np
    >>> lr_value = np.power(d_model, -0.5) * np.min([
    >>>                         np.power(current_steps, -0.5),
    >>>                         np.power(warmup_steps, -1.5) * current_steps])

    Please refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Args:
        d_model(Variable): The dimensionality of the model's input and output.

        warmup_steps(Variable): The number of warmup steps, a hyperparameter.

    Returns:
        The decayed learning rate.
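
    Examples:
        A minimal usage sketch, following the pattern of the other examples
        in this module (``avg_cost`` is assumed to be the loss variable of an
        already-built network; plain Python numbers stand in here for the
        documented Variables):

        .. code-block:: python

          # Transformer-style schedule: warm up, then decay with step**-0.5
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.noam_decay(
                    d_model=512, warmup_steps=4000))
          sgd_optimizer.minimize(avg_cost)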
    """
    global_step = _decay_step_counter(1)

    a = global_step**-0.5
    b = (warmup_steps**-1.5) * global_step
    lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)

    return lr_value


def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """
    Applies exponential decay to the learning rate. 

    When training a model, it is often recommended to lower the learning rate as the 
    training progresses. By using this function, the learning rate will be decayed by 
    'decay_rate' every 'decay_steps' steps.

    >>> if staircase == True:
    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
    >>> else:
    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)

    Args:
        learning_rate(Variable|float): The initial learning rate.
        decay_steps(int): See the decay computation above.
        decay_rate(float): The decay rate. See the decay computation above.
        staircase(Boolean): If True, decay the learning rate at discrete intervals.
                            Default: False

    Returns:
        Variable: The decayed learning rate

    Examples:
        .. code-block:: python

          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
          sgd_optimizer.minimize(avg_cost)

    """
    global_step = _decay_step_counter()

    div_res = global_step / decay_steps
    if staircase:
        div_res = ops.floor(div_res)
    decayed_lr = learning_rate * (decay_rate**div_res)

    return decayed_lr


def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """Applies natural exponential decay to the initial learning rate.

    >>> if not staircase:
    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
    >>> else:
    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))

    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If True, decay the learning rate at discrete
          intervals of `decay_steps`.

    Returns:
        The decayed learning rate
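
    Examples:
        A minimal usage sketch, mirroring the ``exponential_decay`` example
        (``avg_cost`` is assumed to be the loss variable of an already-built
        network):

        .. code-block:: python

          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.natural_exp_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
          sgd_optimizer.minimize(avg_cost)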
    """
    global_step = _decay_step_counter()

    div_res = global_step / decay_steps
    if staircase:
        div_res = ops.floor(div_res)
    decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)

    return decayed_lr


def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """
    Applies inverse time decay to the initial learning rate.

    When training a model, it is often recommended to lower the learning rate as the 
    training progresses. By using this function, an inverse decay function will be 
    applied to the initial learning rate.

    >>> if staircase == True:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
    >>> else:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)

    Args:
        learning_rate(Variable|float): The initial learning rate.
        decay_steps(int): See the decay computation above.
        decay_rate(float): The decay rate. See the decay computation above.
        staircase(Boolean): If True, decay the learning rate at discrete intervals.
                            Default: False

    Returns:
        Variable: The decayed learning rate

    Examples:
        .. code-block:: python

          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.inverse_time_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    decay_rate=0.5,
                    staircase=True))
          sgd_optimizer.minimize(avg_cost)
    """
    global_step = _decay_step_counter()

    div_res = global_step / decay_steps
    if staircase:
        div_res = ops.floor(div_res)

    decayed_lr = learning_rate / (1 + decay_rate * div_res)

    return decayed_lr


def polynomial_decay(learning_rate,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False):
    """
    Applies polynomial decay to the initial learning rate.

    .. code-block:: python

     if cycle:
         decay_steps = decay_steps * ceil(global_step / decay_steps)
     else:
         global_step = min(global_step, decay_steps)

     decayed_learning_rate = (learning_rate - end_learning_rate) *
             (1 - global_step / decay_steps) ^ power + end_learning_rate

    Args:
        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
          will be the initial learning rate during training.
        decay_steps(int32): A Python `int32` number.
        end_learning_rate(float): A Python `float` number, the minimum
          learning rate after decay.
        power(float): A Python `float` number, the power of the polynomial.
        cycle(bool): If True, restart the decay cycle whenever `global_step`
          exceeds the current `decay_steps`, instead of holding the learning
          rate at `end_learning_rate`.

    Returns:
        Variable: The decayed learning rate
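
    Examples:
        A minimal usage sketch, following the other examples in this module
        (``avg_cost`` is assumed to be the loss variable of an already-built
        network):

        .. code-block:: python

          base_lr = 0.1
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.polynomial_decay(
                    learning_rate=base_lr,
                    decay_steps=10000,
                    end_learning_rate=0.001,
                    power=1.0,
                    cycle=True))
          sgd_optimizer.minimize(avg_cost)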
    """
    global_step = _decay_step_counter()

    if cycle:
        div_res = ops.ceil(global_step / decay_steps)
        zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0)
        one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0)

        with control_flow.Switch() as switch:
            with switch.case(global_step == zero_var):
                tensor.assign(input=one_var, output=div_res)
        decay_steps = decay_steps * div_res
    else:
        decay_steps_var = tensor.fill_constant(
            shape=[1], dtype='float32', value=float(decay_steps))
        global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)

    decayed_lr = (learning_rate - end_learning_rate) * \
        ((1 - global_step / decay_steps) ** power) + end_learning_rate
    return decayed_lr


def piecewise_decay(boundaries, values):
    """Applies piecewise decay to the initial learning rate.

      The algorithm can be described as the code below.

      .. code-block:: python

        boundaries = [10000, 20000]
        values = [1.0, 0.5, 0.1]
        if step < 10000:
            learning_rate = 1.0
        elif 10000 <= step < 20000:
            learning_rate = 0.5
        else:
            learning_rate = 0.1
    Args:
        boundaries: A list of step numbers.
        values: A list of learning rate values that will be picked during
            different step boundaries.

    Returns:
        The decayed learning rate.
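
    Examples:
        A minimal usage sketch using the boundaries/values from the algorithm
        description above (``avg_cost`` is assumed to be the loss variable of
        an already-built network):

        .. code-block:: python

          boundaries = [10000, 20000]
          values = [1.0, 0.5, 0.1]
          sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.piecewise_decay(
                    boundaries=boundaries, values=values))
          sgd_optimizer.minimize(avg_cost)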

    """

    if len(values) - len(boundaries) != 1:
        raise ValueError("len(values) - len(boundaries) should be 1")

    global_step = _decay_step_counter()

    lr = tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with control_flow.Switch() as switch:
        for i in range(len(boundaries)):
            boundary_val = tensor.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(boundaries[i]),
                force_cpu=True)
            value_var = tensor.fill_constant(
                shape=[1], dtype='float32', value=float(values[i]))
            with switch.case(global_step < boundary_val):
                tensor.assign(value_var, lr)
        last_value_var = tensor.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            tensor.assign(last_value_var, lr)

    return lr


def append_LARS(params_grads, learning_rate, weight_decay):
    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
       each layer.

    .. code-block:: python

        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
                        / (sqrt(sumsq(gradient)) + weight_decay * sqrt(sumsq(param)))

    Args:
        params_grads: A list of (parameter, gradient) Variable pairs.
        learning_rate: A learning rate Variable. This
          is the global learning rate for LARS.
        weight_decay: A Python `float` number.

    Returns:
        None. The decayed learning rate of each parameter is written back
        to ``param.optimize_attr['learning_rate']``.
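
    Examples:
        A hypothetical sketch only: ``params_grads`` is assumed to be the
        list of (parameter, gradient) pairs produced by the optimizer's
        backward pass, and ``lr`` an existing learning rate Variable.

        .. code-block:: python

          # scales each parameter's local learning rate by its LARS ratio
          append_LARS(params_grads,
                      learning_rate=lr,
                      weight_decay=0.0005)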
    """

    def _balanced_weight(param_norm, grad_norm):
        if weight_decay == 1.0:
            return grad_norm + param_norm
        else:
            return grad_norm + weight_decay * param_norm

    for param, grad in params_grads:
        param_lr = param.optimize_attr['learning_rate']
        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
        if type(param_lr) == float and param_lr == 1.0:
            decayed_lr = learning_rate * param_norm \
                / _balanced_weight(param_norm, grad_norm)
        else:
            decayed_lr = learning_rate * param_lr * param_norm \
                / _balanced_weight(param_norm, grad_norm)
        # set back param local learning rate
        param.optimize_attr['learning_rate'] = decayed_lr