# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.optimizer import Optimizer
from paddle.fluid.regularizer import L1DecayRegularizer
from paddle.fluid.regularizer import L2DecayRegularizer
from paddle.fluid import core
from paddle.fluid import framework
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper
import warnings

import paddle
from paddle import _C_ops, _legacy_C_ops

__all__ = ['Momentum']


class Momentum(Optimizer):
    r"""

    Simple Momentum optimizer with velocity state

    This optimizer has a flag for Nesterov Momentum.

    The update equations are as follows:

    .. math::

        & velocity = mu * velocity + gradient

        & if (use\_nesterov):

        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

        &\quad   param = param - learning\_rate * velocity
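
    A minimal sketch of a single update step in plain Python (illustrative only;
    ``mu``, ``lr``, ``param``, ``grad`` and ``velocity`` are placeholder scalars,
    not part of this API):

    .. code-block:: python

        mu, lr = 0.9, 0.01
        param, grad, velocity = 1.0, 0.2, 0.0

        velocity = mu * velocity + grad                        # 0.2
        # plain momentum step
        param_plain = param - lr * velocity                    # 1.0 - 0.01 * 0.2 = 0.998
        # Nesterov variant
        param_nesterov = param - (grad + mu * velocity) * lr   # 1.0 - 0.38 * 0.01 = 0.9962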

    Parameters:
        learning_rate (float|Variable): The learning rate used to update parameters. \
            Can be a float value or a Variable with one float value as data element.
        momentum (float): Momentum factor.
        parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static graph mode, at this time all parameters will be updated.
        use_nesterov (bool, optional): Enables Nesterov momentum. Default is False.
        regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two methods: \
            :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has already set a \
            regularizer using :ref:`api_fluid_ParamAttr` , the regularization setting here in the optimizer will be \
            ignored for this parameter. Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
            Often chosen to be ``1.0/batch_size``.
        name (str, optional): This parameter is used by developers to print debugging information. \
            For details, please refer to :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid
            import numpy as np

            paddle.enable_static()
            place = fluid.CPUPlace()
            main = fluid.Program()
            with fluid.program_guard(main):
                x = paddle.static.data(name='x', shape=[1, 13], dtype='float32')
                y = paddle.static.data(name='y', shape=[1], dtype='float32')
                linear = paddle.nn.Linear(13, 1)
                y_predict = linear(x)
                cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
                avg_cost = paddle.mean(cost)

                moment_optimizer = fluid.contrib.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
                moment_optimizer.minimize(avg_cost)

                fetch_list = [avg_cost]
                train_reader = paddle.batch(
                    paddle.dataset.uci_housing.train(), batch_size=1)
                feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
                exe = fluid.Executor(place)
                exe.run(paddle.static.default_startup_program())
                for data in train_reader():
                    exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    """
    _velocity_acc_str = "velocity"

    def __init__(
        self,
        learning_rate,
        momentum,
        parameter_list=None,
        use_nesterov=False,
        regularization=None,
        grad_clip=None,
        multi_precision=False,
        rescale_grad=1.0,
        name=None,
    ):
        assert learning_rate is not None
        assert momentum is not None
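        # L2 decay is applied inside the momentum kernel itself (via the
        # regularization_method / regularization_coeff attributes set below), so it
        # is filtered out of the base Optimizer's generic regularization handling.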
        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
        py_regular = None if predicate(regularization) else regularization
        super().__init__(
            learning_rate=learning_rate,
            parameter_list=parameter_list,
            regularization=py_regular,
            grad_clip=grad_clip,
            name=name,
        )
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)
        self._regularization_method = ""
        self._regularization_coeff = 0
        if isinstance(regularization, L2DecayRegularizer):
            self._regularization_method = "l2_decay"
            self._regularization_coeff = regularization._regularization_coeff
        self._multi_precision = multi_precision
        self._rescale_grad = rescale_grad
        self._master_weights = {}

    def _create_master_weight(self, param):
        assert isinstance(self.helper, LayerHelper)

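        # Keep an FP32 "master" copy of the (FP16) parameter; the momentum op reads
        # and updates this copy so the weight update itself happens in full precision.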
        var_name = param.name + "_fp32_master"
        var_name = unique_name.generate(var_name)
        var = paddle.static.create_global_var(
            name=var_name,
            shape=param.shape,
            value=0,
            dtype='float32',
            persistable=True,
        )
        block = self.helper.startup_program.global_block()
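        # Initialize the master weight in the startup program by casting the
        # original parameter to FP32.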
        block.append_op(
            type="cast",
            inputs={"X": [param]},
            outputs={"Out": [var]},
            attrs={
                "in_dtype": param.dtype,
                "out_dtype": core.VarDesc.VarType.FP32,
            },
        )
        self._master_weights[param.name] = var
        return var

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if self._name is not None:
            name = self._name + "_" + name
        find_master = (
            self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
        )
        target_param = (
            self._master_weights[param.name] if find_master else param
        )
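        # With multi_precision, accumulators were registered against the FP32 master
        # weight, so look them up by the master weight's name.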
        target_name = target_param.name
        if (
            name not in self._accumulators
            or target_name not in self._accumulators[name]
        ):
            raise Exception(
                "Accumulator {} does not exist for parameter {}".format(
                    name, target_name
                )
            )
        return self._accumulators[name][target_name]

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
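            # For FP16 parameters under multi_precision, attach the velocity
            # accumulator to the FP32 master weight instead of the raw parameter.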
            if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
                master_p = self._create_master_weight(p)
                self._add_accumulator(self._velocity_acc_str, master_p)
                continue
            if (
                p.dtype == core.VarDesc.VarType.FP16
                and not self._multi_precision
            ):
                warnings.warn(
                    "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. "
                    "Consider using the multi_precision=True option of the Momentum optimizer."
                )
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(
            self._velocity_acc_str, param_and_grad[0]
        )
        lr = self._create_param_lr(param_and_grad)

        find_master = (
            self._multi_precision
            and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
        )
        master_weight = (
            self._master_weights[param_and_grad[0].name]
            if find_master
            else None
        )

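        # Dynamic graph mode: call the C++ momentum kernel directly, passing
        # attributes as flattened name/value pairs; the outputs are the same
        # variables as the inputs, so the update happens in place.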
        if framework._non_static_mode():
            _, _, _ = _legacy_C_ops.momentum(
                param_and_grad[0],
                param_and_grad[1],
                velocity_acc,
                lr,
                master_weight,
                param_and_grad[0],
                velocity_acc,
                master_weight,
                'mu',
                self._momentum,
                'use_nesterov',
                self._use_nesterov,
                'regularization_method',
                self._regularization_method,
                'regularization_coeff',
                self._regularization_coeff,
                'multi_precision',
                find_master,
            )
            return None

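        # Static graph mode: describe the momentum op explicitly and append it to
        # the block.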
        attrs = {
            "mu": self._momentum,
            "use_nesterov": self._use_nesterov,
            "regularization_method": self._regularization_method,
            "regularization_coeff": self._regularization_coeff,
            "multi_precision": find_master,
            "rescale_grad": self._rescale_grad,
        }
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr],
        }
        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc],
        }

        if find_master:
            inputs["MasterParam"] = master_weight
            outputs["MasterParamOut"] = master_weight

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True,
        )

        return momentum_op