# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from .. import core
from ..layers import utils
from ..layers import nn as F
from .. import dygraph_utils
from . import layers
from ..framework import (
    Variable,
    _non_static_mode,
    OpProtoHolder,
    Parameter,
    _dygraph_tracer,
    _varbase_creator,
    default_main_program,
    _global_flags,
    in_dygraph_mode,
    _in_legacy_dygraph,
)

from ..data_feeder import (
    convert_dtype,
    check_variable_and_dtype,
    check_type,
    check_dtype,
)

from ..param_attr import ParamAttr
from ..initializer import Normal, Constant, NumpyArrayInitializer
from .. import unique_name
from .layer_object_helper import LayerObjectHelper
import numpy as np
import numbers
import logging
import os
import paddle.utils.deprecated as deprecated
from paddle import _C_ops, _legacy_C_ops

__all__ = []


class BatchNorm(layers.Layer):
    r"""

    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Batch Normalization Layer and can be used
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the mean and variance of the channel based on the current batch data.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
        //\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
        //\ mini-batch\ variance \\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. They are updated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &//\ global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &//\ global\ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_{\beta}} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift


    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift (bias) parameter
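
    A rough NumPy sketch of the computation above (illustrative only; it assumes
    NCHW input and per-channel statistics computed from the current mini-batch,
    and the variable names are purely for demonstration):

    .. code-block:: python

        import numpy as np

        x = np.random.rand(4, 10, 8, 8).astype('float32')   # NCHW input
        gamma = np.ones(10, 'float32')                       # scale
        beta = np.zeros(10, 'float32')                       # shift
        moving_mean = np.zeros(10, 'float32')
        moving_var = np.ones(10, 'float32')
        momentum, eps = 0.9, 1e-5

        mu = x.mean(axis=(0, 2, 3))                          # mini-batch mean
        var = x.var(axis=(0, 2, 3))                          # mini-batch variance
        # running statistics, used when use_global_stats=True or at inference
        moving_mean = moving_mean * momentum + mu * (1.0 - momentum)
        moving_var = moving_var * momentum + var * (1.0 - momentum)
        x_hat = (x - mu.reshape(1, -1, 1, 1)) / np.sqrt(var.reshape(1, -1, 1, 1) + eps)
        y = gamma.reshape(1, -1, 1, 1) * x_hat + beta.reshape(1, -1, 1, 1)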

    Parameters:
        num_channels(int): Indicate the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test (bool, optional): A flag indicating whether it is in test phase or not.
             This flag only takes effect in static graph mode. For dygraph mode, please use ``eval()``.
             Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized to zero. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format, which can be "NCHW" or "NHWC", where `N` is the batch size, `C` is the number of channels, `H` is the height of the feature map, and `W` is the width of the feature map. Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of moving_mean which stores the global mean. Default: None.
        moving_variance_name(str, optional): The name of moving_variance which stores the global variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether the mean and variance should participate in
            model averaging when model averaging is enabled. Default: True.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, setting use_global_stats to True
            or is_test to True has the same effect.
            In train mode, when use_global_stats is set to True, the global mean
            and variance are also used during training. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
            setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle
          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable

          x = paddle.rand([3, 10, 3, 7], 'float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm(10)
              hidden1 = batch_norm(x)
    """

    def __init__(
        self,
        num_channels,
        act=None,
        is_test=False,
        momentum=0.9,
        epsilon=1e-05,
        param_attr=None,
        bias_attr=None,
        dtype='float32',
        data_layout='NCHW',
        in_place=False,
        moving_mean_name=None,
        moving_variance_name=None,
        do_model_average_for_mean_and_var=True,
        use_global_stats=False,
        trainable_statistics=False,
    ):
        super().__init__()
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

        assert (
            bias_attr is not False
        ), "bias_attr should not be False in batch_norm."

        if dtype == "float16":
            self._dtype = "float32"
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
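        # scale (gamma): one value per channel, initialized to 1.0; it is frozen
        # only when use_global_stats is on and its learning rate is 0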
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0),
        )
        self.weight.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

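        # shift (beta): one value per channel, zero-initialized via is_bias=True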
        self.bias = self.create_parameter(
            attr=self._bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True,
        )
        self.bias.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

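        # running mean: starts at 0, updated by the op itself, never trained by gradients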
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._mean.stop_gradient = True

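        # running variance: starts at 1, updated by the op itself, never trained by gradients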
        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._variance.stop_gradient = True

        self._in_place = in_place
        self._data_layout = data_layout
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = False
        self._use_global_stats = use_global_stats
        self._trainable_statistics = trainable_statistics

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

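        # Three execution paths: new dygraph mode calls the fused _C_ops kernel,
        # legacy dygraph calls _legacy_C_ops with a flat attribute list, and
        # static graph mode appends a batch_norm op to the program below.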
        if _non_static_mode():
            if in_dygraph_mode():
                batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
                    input,
                    self._mean,
                    self._variance,
                    self.weight,
                    self.bias,
                    not self.training,
                    self._momentum,
                    self._epsilon,
                    self._data_layout,
                    self._use_global_stats,
                    self._trainable_statistics,
                )
                return dygraph_utils._append_activation_in_dygraph(
                    batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
                )

            elif _in_legacy_dygraph():
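                # attributes are passed to the legacy op as a flat
                # (name, value, name, value, ...) tuple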
                attrs = (
                    "momentum",
                    self._momentum,
                    "epsilon",
                    self._epsilon,
                    "is_test",
                    not self.training,
                    "data_layout",
                    self._data_layout,
                    "use_mkldnn",
                    self._use_mkldnn,
                    "fuse_with_relu",
                    self._fuse_with_relu,
                    "use_global_stats",
                    self._use_global_stats,
                    'trainable_statistics',
                    self._trainable_statistics,
                )
                batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm(
                    input,
                    self.weight,
                    self.bias,
                    self._mean,
                    self._variance,
                    None,
                    mean_out,
                    variance_out,
                    *attrs
                )

            return dygraph_utils._append_activation_in_dygraph(
                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
            )

        check_variable_and_dtype(
            input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
        )

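        # Static graph path: declare op attributes, inputs and outputs, then
        # append a batch_norm op to the current program.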
        attrs = {
            "momentum": self._momentum,
            "epsilon": self._epsilon,
            "is_test": self._is_test,
            "data_layout": self._data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": self._fuse_with_relu,
            "use_global_stats": self._use_global_stats,
            "trainable_statistics": self._trainable_statistics,
        }

        inputs = {
            "X": [input],
            "Scale": [self.weight],
            "Bias": [self.bias],
            "Mean": [self._mean],
324
            "Variance": [self._variance],
325 326
        }

327
        saved_mean = self._helper.create_variable_for_type_inference(
328 329
            dtype=self._dtype, stop_gradient=True
        )
330
        saved_variance = self._helper.create_variable_for_type_inference(
331 332
            dtype=self._dtype, stop_gradient=True
        )
333
        reserve_space = self._helper.create_variable_for_type_inference(
334 335
            dtype=self._helper.input_dtype(input), stop_gradient=True
        )
336

337 338 339 340 341
        batch_norm_out = (
            input
            if self._in_place
            else self._helper.create_variable_for_type_inference(self._dtype)
        )
342 343 344 345 346 347

        outputs = {
            "Y": [batch_norm_out],
            "MeanOut": [mean_out],
            "VarianceOut": [variance_out],
            "SavedMean": [saved_mean],
348
            "SavedVariance": [saved_variance],
349
        }
350
        if reserve_space is not None:
351
            outputs["ReserveSpace"] = [reserve_space]
352

353 354 355
        self._helper.append_op(
            type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
        )
M
minqiyang 已提交
356

L
lujun 已提交
357
        # Currently, we don't support inplace in dygraph mode
358
        return self._helper.append_activation(batch_norm_out, self._act)
359 360


L
lujun 已提交
361
class RowConv(layers.Layer):
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379
    """
    ***Row-convolution operator***

    The row convolution is called lookahead convolution.  This operator was introduced in the following paper for DeepSpeech2:
    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf

    The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
    and low-latency setting. The lookahead convolution incorporates information
    from future subsequences in a computationally efficient manner to improve
    unidirectional recurrent neural networks. The row convolution operator is
    different from the 1D sequence convolution, and is computed as follows:

    Given an input sequence X of length T and input dimension D, the filter (W) is of size context * D.

    For more details about row_conv, please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
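
    A minimal NumPy sketch of the lookahead computation (illustrative only; it
    ignores LoD/batching, uses made-up sizes, and simply truncates the window at
    the end of the sequence):

    .. code-block:: python

        import numpy as np

        T, D, context = 16, 8, 3          # context = future_context_size + 1
        X = np.random.rand(T, D).astype('float32')
        W = np.random.rand(context, D).astype('float32')

        out = np.zeros_like(X)
        for t in range(T):
            for j in range(min(context, T - t)):
                out[t] += W[j] * X[t + j]  # element-wise weighting of future rows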

    Parameters:
        name_scope(str): The name of this class.
        future_context_size (int): Future context size. Please note that the shape
            of the convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to output variable. Default: None.

    Attributes:
        weight (Parameter): the learnable weights of this layer.

    Returns:
        the output(Out) is a LoDTensor, which supports variable time-length input sequences.
        The underlying tensor in this LoDTensor is a matrix with shape T x N, i.e., the same shape as X.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              x = numpy.random.random((16, 32)).astype('float32')
              rowConv = fluid.dygraph.nn.RowConv(
                    'RowConv', future_context_size=2)
              ret = rowConv(fluid.dygraph.base.to_variable(x))

    """

    def __init__(
        self, name_scope, future_context_size, param_attr=None, act=None
    ):
        assert (
            not _non_static_mode()
        ), "RowConv is not supported by dynamic graph mode yet!"
        super().__init__(name_scope)
        self._act = act
        self._param_attr = param_attr
        self._future_context_size = future_context_size

    def _build_once(self, input):
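        # The filter shape depends on the input feature size D = input.shape[1],
        # so the parameter is created lazily on the first call.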
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._future_context_size + 1, input.shape[1]]
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            is_bias=False,
        )

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type='row_conv',
            inputs={'X': [input], 'Filter': [self.weight]},
            outputs={'Out': [out]},
        )
        return self._helper.append_activation(out, act=self._act)