#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle import _C_ops
from paddle.fluid import framework
from paddle.fluid.framework import in_dygraph_mode

__all__ = ['L1Decay', 'L2Decay']


class WeightDecayRegularizer:
    """Base class for weight decay regularizers

    Defines the common interface of weight-decay regularizers.
    Weight-decay regularizers are added only during the backward
    pass for faster regularization. They add operations to the network
    that correspond to the gradient of the regularization function.
    Users should not use this class directly; use one of its
    implementations instead.
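
    A minimal, hypothetical sketch of a subclass (``MyDecay`` is not a real
    class in this module) would look like:

    .. code-block:: python

        class MyDecay(WeightDecayRegularizer):
            def __init__(self, coeff=0.0):
                super().__init__()
                self._coeff = coeff

            def __call__(self, param, grad, block):
                # Should return a variable holding the gradient of the
                # regularization term w.r.t. ``param``; see L1Decay and
                # L2Decay below for concrete implementations.
                raise NotImplementedError()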
    """

    def __init__(self):
        pass

    def __call__(self, param, grad, block):
        """Add corresponding weight decay operations to the network"""
        raise NotImplementedError()

    def __str__(self):
        """Debug string"""
        raise NotImplementedError()


class L1Decay(WeightDecayRegularizer):
    r"""
    Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse.

    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
    When set in ``ParamAttr`` , it only takes effect on the trainable parameters of that layer. When set in
    ``optimizer`` , it takes effect on all trainable parameters. When set in both, ``ParamAttr`` has
    higher priority than ``optimizer`` : for a trainable parameter, if a regularizer is defined
    in its ``ParamAttr`` , the regularizer in the optimizer is ignored; otherwise the regularizer
    in the optimizer is used.

    In the implementation, the loss function of L1 Weight Decay Regularization is as follows:

    .. math::

        loss = coeff * reduce\_sum(abs(x))
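
    The term that is actually added to the parameter's gradient during the
    backward pass is the gradient of this loss:

    .. math::

        gradient = coeff * sign(x)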

    Args:
        coeff (float, optional): regularization coefficient. Default: 0.0.

    Examples:
        .. code-block:: python
            :name: code-example1

            # Example1: set Regularizer in optimizer
            import paddle
            from paddle.regularizer import L1Decay

            linear = paddle.nn.Linear(10, 10)
            inp = paddle.rand(shape=[10, 10], dtype="float32")
            out = linear(inp)
            loss = paddle.mean(out)
            momentum = paddle.optimizer.Momentum(
                learning_rate=0.1,
                parameters=linear.parameters(),
                weight_decay=L1Decay(0.0001))
            loss.backward()
            momentum.step()
            momentum.clear_grad()

        .. code-block:: python
            :name: code-example2

            # Example2: set Regularizer in parameters
            # Set L1 regularization in parameters.
            # Global regularizer does not take effect on my_conv2d for this case.
            from paddle.nn import Conv2D
            from paddle import ParamAttr
            from paddle.regularizer import L1Decay

            my_conv2d = Conv2D(
                    in_channels=10,
                    out_channels=10,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    weight_attr=ParamAttr(regularizer=L1Decay(coeff=0.01)),
                    bias_attr=False)
    """

    def __init__(self, coeff=0.0):
        assert coeff is not None
        super().__init__()
        self._coeff = coeff

    def __call__(self, param, grad, block):
        """Add L1 weight decay ops to network

        Adds L1 weight decay ops.
        L1WeightDecay = reg_coeff * sign(parameter)

        Args:
            param: parameter variable for which regularization is applied
            grad: gradient of the parameter (not used by this regularizer)
            block: block in which variable is to be created

        Returns:
            new variable for weight decay
        """
        assert isinstance(param, framework.Variable)
        assert isinstance(block, framework.Block)

        if in_dygraph_mode():
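            # Dynamic graph mode: compute the decay term eagerly as
            # coeff * sign(param), i.e. the gradient of coeff * sum(|param|).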
            sign = _C_ops.sign(param)
            return _C_ops.scale(sign, self._coeff, 0.0, True)
        else:
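            # Static graph mode: create intermediate variables and append
            # the corresponding sign and scale ops to the block.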
            sign = block.create_var(
                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level
            )
            decay = block.create_var(
                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level
            )
            # Append sign op
            block.append_op(
                type='sign', inputs={"X": param}, outputs={"Out": sign}
            )

            # Append scale op to the output of sign op
            block.append_op(
                type='scale',
                inputs={"X": sign},
                outputs={"Out": decay},
                attrs={"scale": self._coeff},
            )
            return decay

    def __str__(self):
        return "L1Decay, coeff=%f" % self._coeff


class L2Decay(WeightDecayRegularizer):
    r"""
    Implement the L2 Weight Decay Regularization, which helps prevent the model from over-fitting.

    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
    When set in ``ParamAttr`` , it only takes effect on the trainable parameters of that layer. When set in
    ``optimizer`` , it takes effect on all trainable parameters. When set in both, ``ParamAttr`` has
    higher priority than ``optimizer`` : for a trainable parameter, if a regularizer is defined
    in its ``ParamAttr`` , the regularizer in the optimizer is ignored; otherwise the regularizer
    in the optimizer is used.

    In the implementation, the loss function of L2 Weight Decay Regularization is as follows:

    .. math::

        loss = 0.5 * coeff * reduce\_sum(square(x))
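
    The term that is actually added to the parameter's gradient during the
    backward pass is the gradient of this loss:

    .. math::

        gradient = coeff * x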

    Args:
        coeff (float, optional): regularization coefficient. Default: 0.0.

    Examples:
        .. code-block:: python
            :name: code-example1

            # Example1: set Regularizer in optimizer
            import paddle
            from paddle.regularizer import L2Decay
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.rand(shape=[10, 10], dtype="float32")
            out = linear(inp)
            loss = paddle.mean(out)
            momentum = paddle.optimizer.Momentum(
                learning_rate=0.1,
                parameters=linear.parameters(),
                weight_decay=L2Decay(0.0001))
            loss.backward()
            momentum.step()
            momentum.clear_grad()

        .. code-block:: python
            :name: code-example2

            # Example2: set Regularizer in parameters
            # Set L2 regularization in parameters.
            # Global regularizer does not take effect on my_conv2d for this case.
            from paddle.nn import Conv2D
            from paddle import ParamAttr
            from paddle.regularizer import L2Decay

            my_conv2d = Conv2D(
                    in_channels=10,
                    out_channels=10,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)),
                    bias_attr=False)
    """

    def __init__(self, coeff=0.0):
        assert coeff is not None
        super().__init__()
        self._coeff = coeff

    def __call__(self, param, grad, block):
        """Add L2 weight decay ops to network

        Adds L2 weight decay ops.
        L2WeightDecay = reg_coeff * parameter

        Args:
            param: parameter variable for which regularization is applied
            grad: gradient of the parameter (not used by this regularizer)
            block: block in which variable is to be created

        Returns:
            new variable for weight decay
        """
        assert isinstance(param, framework.Variable)
        assert isinstance(block, framework.Block)

        if in_dygraph_mode():
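            # Dynamic graph mode: compute the decay term eagerly as
            # coeff * param, i.e. the gradient of 0.5 * coeff * sum(param^2).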
            return _C_ops.scale(param, self._coeff, 0.0, True)
        else:
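            # Static graph mode: create the output variable and append a
            # scale op to the block.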
            decay = block.create_var(
                dtype=param.dtype, shape=param.shape, lod_level=param.lod_level
            )

            # Append Op to calculate decay
            block.append_op(
                type='scale',
                inputs={"X": param},
                outputs={"Out": decay},
                attrs={"scale": self._coeff},
            )

            return decay

    def __str__(self):
        return "L2Decay, coeff=%f" % self._coeff