lamb.py 14.2 KB
Newer Older
T
Thomas Young 已提交
1
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2 3 4 5 6 7 8 9 10 11 12 13 14
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
from paddle import _C_ops, _legacy_C_ops
16
from paddle.fluid.executor import global_scope
17

18 19 20 21 22
from ..fluid import core, framework, layers, unique_name
from ..fluid.framework import Variable
from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer

23 24
__all__ = []

25 26

class Lamb(Optimizer):
27
    r"""
    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing
    accuracy, which supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    ..  math::

        m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t

        v_t &= \beta_2 v_{t - 1}  + (1 - \beta_2)g_t^2

        \hat{m}_t &= \frac{m_t}{1 - \beta_1^t}

        \hat{v}_t &= \frac{v_t}{1 - \beta_2^t}

        r_t &= \frac{\hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}

        w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1})


    where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\hat{m}` and
    :math:`\hat{v}` their bias-corrected estimates, :math:`\eta` the
    learning rate, :math:`\lambda` the LAMB weight decay rate.

    Args:
        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Remind that weight_decay should be None.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
        parameters (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. And you can specify different options for \
            different parameter groups such as the learning rate, weight decay, etc, \
            then the parameters are list of dict. Note that the learning_rate in parameter groups \
            represents the scale of base learning_rate. \
            The default value is None in static mode, at this time all parameters will be updated.
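        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` , :ref:`api_paddle_fluid_clip_ClipGradByNorm` ,
            :ref:`api_paddle_fluid_clip_ClipGradByValue` ). If you want better convergence, it is recommended
            to use :ref:`api_paddle_fluid_clip_ClipGradByGlobalNorm` . Default None, meaning there is no gradient clipping.
        exclude_from_weight_decay_fn (callable, optional): A function that is called with a parameter;
            when it returns a true value, the LAMB weight decay used for that parameter is 0.0.
            Default None, meaning ``lamb_weight_decay`` is applied to every parameter.
        multi_precision (bool, optional): Whether to keep a float32 master copy of each float16
            parameter and pass it to the update. Default False.
        name(str|None): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually there is no need to set name, and it is None by default.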
    Examples:
        .. code-block:: python

            import paddle

            inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1)
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)
            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.85], dtype="float32")
            lamb = paddle.optimizer.Lamb(learning_rate=0.002, parameters=linear.parameters(), lamb_weight_decay=0.01)
            back = out.backward()
            lamb.step()
            lamb.clear_grad()

    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
    _beta1_pow_acc_str = "beta1_pow_acc"
    _beta2_pow_acc_str = "beta2_pow_acc"

99 100 101 102 103 104 105 106 107 108 109 110 111
    def __init__(
        self,
        learning_rate=0.001,
        lamb_weight_decay=0.01,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-6,
        parameters=None,
        grad_clip=None,
        exclude_from_weight_decay_fn=None,
        multi_precision=False,
        name=None,
    ):
        """Store the LAMB hyper-parameters and initialise master-weight bookkeeping.

        Args:
            learning_rate: forwarded to the base optimizer; must not be None.
            lamb_weight_decay: weight decay rate used by the LAMB update.
            beta1: decay rate for the 1st moment; must not be None.
            beta2: decay rate for the 2nd moment; must not be None.
            epsilon: numerical-stability constant; must not be None.
            parameters: forwarded to the base optimizer.
            grad_clip: forwarded to the base optimizer.
            exclude_from_weight_decay_fn: optional predicate called with a
                parameter; a true result makes the update use a weight decay
                of 0.0 for that parameter.
            multi_precision: when true, float16 parameters get a float32
                master copy.
            name: forwarded to the base optimizer.
        """
        # These four hyper-parameters have no usable "unset" state.
        for required in (learning_rate, beta1, beta2, epsilon):
            assert required is not None

        # The base class is always given ``weight_decay=None``; decay is
        # carried by ``lamb_weight_decay`` instead.
        super().__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=None,
            grad_clip=grad_clip,
            name=name,
        )
        self.type = "lamb"

        # Values _update_param_group falls back to when a parameter group
        # does not override an option.
        self._default_dict = {
            'beta1': beta1,
            'beta2': beta2,
            'epsilon': epsilon,
            'lamb_weight_decay': lamb_weight_decay,
            'exclude_from_weight_decay_fn': exclude_from_weight_decay_fn,
        }
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._lamb_weight_decay = lamb_weight_decay
        self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn

        # param name -> float32 master variable, filled by _create_master_weight.
        self._master_weights = {}
        # param name -> master variable name, filled by _append_optimize_op.
        self._used_master_weights = {}
        # TODO(zengjinle): expose API as soon as possible
        self._multi_precision = multi_precision
140

141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
    def _get_parameter(self, name, scope=None):
        """Look up a parameter tensor and, if one was used, its master tensor.

        Args:
            name: name of the parameter variable to look up in ``scope``.
            scope: scope to search; ``global_scope()`` is used when None.

        Returns:
            A pair ``(param_tensor, master_tensor)``. ``master_tensor`` is None
            when no master weight has been recorded for ``name``.
        """
        scope = global_scope() if scope is None else scope
        param_tensor = scope.find_var(name).get_tensor()

        master_name = self._used_master_weights.get(name)
        if master_name is None:
            return param_tensor, None

        master_tensor = scope.find_var(master_name).get_tensor()
        # A master copy must differ in dtype from, but match the shape of,
        # the parameter it shadows.
        assert master_tensor._dtype() != param_tensor._dtype()
        assert master_tensor.shape() == param_tensor.shape()
        return param_tensor, master_tensor

156 157 158 159 160 161 162 163 164
    def _create_master_weight(self, param):
        """Return the float32 master variable for ``param``, creating it once.

        The first call for a parameter creates a persistable float32 global
        variable of the same shape and appends a ``cast`` op from ``param`` to
        it in the startup program; later calls return the cached variable.

        Args:
            param: the parameter that needs a master copy.

        Returns:
            The master variable registered under ``param.name``.
        """
        assert self._multi_precision
        if param.name in self._master_weights:
            return self._master_weights[param.name]

        assert isinstance(self.helper, LayerHelper)

        master_name = unique_name.generate(param.name + "_fp32_master")
        master_var = layers.create_global_var(
            name=master_name,
            shape=param.shape,
            value=0,
            dtype='float32',
            persistable=True,
        )

        # Fill the master copy from the parameter via a cast in the startup
        # program.
        startup_block = self.helper.startup_program.global_block()
        startup_block.append_op(
            type="cast",
            inputs={"X": [param]},
            outputs={"Out": [master_var]},
            attrs={
                "in_dtype": param.dtype,
                "out_dtype": core.VarDesc.VarType.FP32,
            },
        )

        self._master_weights[param.name] = master_var
        return master_var
184 185 186

    def _create_accumulators(self, block, parameters):
        """Create the moment and beta-power accumulators for every parameter.

        Args:
            block: must be a ``framework.Block``.
            parameters: an iterable of parameters, or a parameter-group dict
                that is first resolved through ``_update_param_group``.
        """
        assert isinstance(block, framework.Block)
        if isinstance(parameters, dict):
            parameters = self._update_param_group(parameters)

        for param in parameters:
            needs_master = (
                self._multi_precision
                and param.dtype == core.VarDesc.VarType.FP16
            )
            # With multi-precision on, float16 parameters have their
            # accumulators attached to the float32 master copy.
            owner = self._create_master_weight(param) if needs_master else param
            self._add_moments_pows(owner)

    def _get_accumulator(self, name, param):
        """Fetch one accumulator belonging to a parameter.

        When multi-precision is on and ``param`` is float16, the accumulator is
        looked up under the parameter's master weight rather than ``param``.

        Args:
            name: name of the accumulator; prefixed with ``self._name`` and an
                underscore when ``self._name`` is set.
            param: parameter variable whose accumulator is wanted.

        Returns:
            The accumulator variable.

        Raises:
            Exception: if no such accumulator is registered.
        """
        if self._name is not None:
            name = self._name + "_" + name

        find_master = (
            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        )
        if find_master:
            owner = self._master_weights[param.name]
        else:
            owner = param
        owner_name = owner.name

        # Membership tests only, so a failed lookup never inserts a key.
        missing = (
            name not in self._accumulators
            or owner_name not in self._accumulators[name]
        )
        if missing:
            raise Exception(
                "Accumulator {} does not exist for parameter {}".format(
                    name, owner_name
                )
            )
        return self._accumulators[name][owner_name]

    def _add_moments_pows(self, p):
        """Register the four LAMB accumulators for ``p``.

        Adds the two moment accumulators and the two beta-power accumulators.
        Accumulators of a float16 variable are created as float32.

        Args:
            p: the variable the accumulators are attached to.
        """
        fp16 = core.VarDesc.VarType.FP16
        acc_dtype = core.VarDesc.VarType.FP32 if p.dtype == fp16 else p.dtype

        for moment_name in (self._moment1_acc_str, self._moment2_acc_str):
            self._add_accumulator(moment_name, p, dtype=acc_dtype)

        # Each beta-power accumulator starts at the beta value itself; when
        # beta is a Variable, the literal default is used as the fill value.
        beta_pow_specs = (
            (self._beta1_pow_acc_str, self._beta1, 0.9),
            (self._beta2_pow_acc_str, self._beta2, 0.999),
        )
        for pow_name, beta, literal_default in beta_pow_specs:
            initial = literal_default if isinstance(beta, Variable) else beta
            self._add_accumulator(
                name=pow_name,
                param=p,
                dtype=acc_dtype,
                fill_value=initial,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR,
                device='cpu',
            )
255 256 257

    def _append_optimize_op(self, block, param_and_grad):
        """Issue the LAMB update for one parameter/gradient pair.

        When ``framework.in_dygraph_mode()`` is true the update is dispatched
        to ``_C_ops.lamb_``; when ``framework._non_static_mode()`` is true it is
        dispatched to ``_legacy_C_ops.lamb``. Otherwise a ``lamb`` op is
        appended to ``block``.

        Args:
            block: must be a ``framework.Block``.
            param_and_grad: indexed as ``[0]`` parameter and ``[1]`` gradient,
                or a parameter-group dict that is first resolved through
                ``_update_param_group``.

        Returns:
            None for the two non-static paths, otherwise the appended op.
        """
        assert isinstance(block, framework.Block)
        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)

        block.program._use_lamb = True

        param = param_and_grad[0]
        moment1, moment2, beta1_pow_acc, beta2_pow_acc = (
            self._get_accumulator(acc_name, param)
            for acc_name in (
                self._moment1_acc_str,
                self._moment2_acc_str,
                self._beta1_pow_acc_str,
                self._beta2_pow_acc_str,
            )
        )

        # A parameter selected by the exclusion predicate is updated with a
        # weight decay of zero.
        exclude_fn = self._exclude_from_weight_decay_fn
        if exclude_fn is not None and exclude_fn(param):
            weight_decay = 0.0
        else:
            weight_decay = self._lamb_weight_decay

        lr = self._create_param_lr(param_and_grad)

        find_master = (
            self._multi_precision
            and param.dtype == core.VarDesc.VarType.FP16
        )
        p_name = param.name
        master_weight = None
        if find_master:
            master_weight = self._master_weights[p_name]
            # Record which master weight was used so _get_parameter can find it.
            self._used_master_weights[p_name] = master_weight.name

        found_inf = self._get_auxiliary_var('found_inf')

        grad = param_and_grad[1]
        # State passed to both non-static ops, in the positional order they
        # expect.
        state = (
            moment1,
            moment2,
            beta1_pow_acc,
            beta2_pow_acc,
            master_weight,
        )

        if framework.in_dygraph_mode():
            _C_ops.lamb_(
                param,
                grad,
                lr,
                *state,
                found_inf,
                weight_decay,
                self._beta1,
                self._beta2,
                self._epsilon,
                find_master,
            )
            return None

        if framework._non_static_mode():
            # The legacy op takes the inputs, then the same variables again as
            # outputs, then attribute name/value pairs.
            _legacy_C_ops.lamb(
                param,
                grad,
                lr,
                *state,
                param,
                *state,
                'beta1',
                self._beta1,
                'beta2',
                self._beta2,
                'epsilon',
                self._epsilon,
                'weight_decay',
                weight_decay,
                'multi_precision',
                find_master,
            )
            return None

        # Static graph: describe the lamb op and append it to the block.
        inputs = {
            "Param": param,
            "Grad": grad,
            "LearningRate": lr,
            "Moment1": moment1,
            "Moment2": moment2,
            "Beta1Pow": beta1_pow_acc,
            "Beta2Pow": beta2_pow_acc,
        }
        outputs = {
            "ParamOut": param,
            "Moment1Out": moment1,
            "Moment2Out": moment2,
            "Beta1PowOut": beta1_pow_acc,
            "Beta2PowOut": beta2_pow_acc,
        }
        attrs = {
            "beta1": self._beta1,
            "beta2": self._beta2,
            "epsilon": self._epsilon,
            "weight_decay": weight_decay,
            "multi_precision": find_master,
        }

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        # NOTE(review): this relies on the truthiness of whatever is stored
        # under 'found_inf'; confirm that is intended for variable objects.
        if found_inf:
            inputs["SkipUpdate"] = found_inf

        return block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True,
        )
385 386 387 388 389 390

    def _update_param_group(self, parameters):
        """Load a parameter group's options onto the optimizer.

        Each of ``beta1``, ``beta2``, ``epsilon``, ``lamb_weight_decay`` and
        ``exclude_from_weight_decay_fn`` is read from ``parameters``; a missing
        key falls back to the value in ``self._default_dict``. The result is
        stored on the matching underscore-prefixed attribute.

        Args:
            parameters: a parameter-group dict.

        Returns:
            The value stored under ``'params'`` in the group, or None if the
            key is absent.
        """
        option_keys = (
            'beta1',
            'beta2',
            'epsilon',
            'lamb_weight_decay',
            'exclude_from_weight_decay_fn',
        )
        for key in option_keys:
            # Attribute names are the option keys with a leading underscore.
            setattr(self, '_' + key, parameters.get(key, self._default_dict[key]))
        return parameters.get('params')