# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from .. import core
from ..layers import utils
from ..layers import nn as F
from .. import dygraph_utils
from . import layers
from ..framework import (
    Variable,
    _non_static_mode,
    OpProtoHolder,
    Parameter,
    _dygraph_tracer,
    _varbase_creator,
    default_main_program,
    _global_flags,
    in_dygraph_mode,
    _in_legacy_dygraph,
)

from ..data_feeder import (
    convert_dtype,
    check_variable_and_dtype,
    check_type,
    check_dtype,
)
from ..param_attr import ParamAttr
from ..initializer import Normal, Constant, NumpyArrayInitializer
from .. import unique_name
from .layer_object_helper import LayerObjectHelper
from ..data_feeder import check_variable_and_dtype, check_type
import numpy as np
import numbers
import logging
import os
import paddle.utils.deprecated as deprecated
from paddle import _C_ops, _legacy_C_ops

__all__ = []


class BatchNorm(layers.Layer):
    r"""

    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to the code examples.
    It implements the function of the Batch Normalization Layer and can be used
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the mean and variance of each channel, computed from the current batch data.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    They are calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
        //\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
        //\ mini-batch\ variance \\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. They are calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &//\ global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &//\ global\ variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift


    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift parameter
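
    A minimal NumPy sketch of the per-channel computation described above
    (illustrative only, not the kernel this layer dispatches to; it assumes
    NCHW layout and uses a hypothetical ``batch_norm_reference`` helper name):

    .. code-block:: python

        import numpy as np

        def batch_norm_reference(x, gamma, beta, epsilon=1e-5):
            # x: [N, C, H, W]; statistics are computed per channel
            mu = x.mean(axis=(0, 2, 3), keepdims=True)
            var = x.var(axis=(0, 2, 3), keepdims=True)
            x_hat = (x - mu) / np.sqrt(var + epsilon)
            # scale and shift with the trainable parameters
            return gamma.reshape(1, -1, 1, 1) * x_hat + beta.reshape(1, -1, 1, 1)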

    Parameters:
        num_channels(int): Indicate the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test (bool, optional): A flag indicating whether it is in test phase or not.
             This flag only has an effect in static graph mode. For dygraph mode, please use ``eval()``.
             Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized to zero. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of moving_mean which stores the global mean. Default: None.
        moving_variance_name(str, optional): The name of the moving_variance which stores the global variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether the mean and variance should be included in model
            averaging when model averaging is enabled. Default: True.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, setting use_global_stats to True
            or setting is_test to True is equivalent.
            In train mode, when setting use_global_stats True, the global mean
            and variance are also used during training. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
            setting trainable_statistics True, mean and variance will be calculated from the current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm(10)
              hidden1 = batch_norm(x)
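
              # Illustrative continuation (assumed usage): switch to evaluation
              # mode so the accumulated global statistics are used instead of
              # the statistics of the current mini-batch.
              batch_norm.eval()
              hidden2 = batch_norm(x)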
    """

    def __init__(
        self,
        num_channels,
        act=None,
        is_test=False,
        momentum=0.9,
        epsilon=1e-05,
        param_attr=None,
        bias_attr=None,
        dtype='float32',
        data_layout='NCHW',
        in_place=False,
        moving_mean_name=None,
        moving_variance_name=None,
        do_model_average_for_mean_and_var=True,
        use_global_stats=False,
        trainable_statistics=False,
    ):
        super().__init__()
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

        assert (
            bias_attr is not False
        ), "bias_attr should not be False in batch_norm."

        if dtype == "float16":
            # Parameters and statistics are kept in float32 even when the
            # layer itself runs in float16.
            self._dtype = "float32"
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0),
        )
        self.weight.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

        self.bias = self.create_parameter(
            attr=self._bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True,
        )
        self.bias.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._mean.stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._variance.stop_gradient = True

        self._in_place = in_place
        self._data_layout = data_layout
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = False
        self._use_global_stats = use_global_stats
        self._trainable_statistics = trainable_statistics

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

        if _non_static_mode():
            # Imperative (dygraph) execution: call the C++ kernel directly
            # instead of appending an op to a static program.
            if in_dygraph_mode():
                batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
                    input,
                    self._mean,
                    self._variance,
                    self.weight,
                    self.bias,
                    not self.training,
                    self._momentum,
                    self._epsilon,
                    self._data_layout,
                    self._use_global_stats,
                    self._trainable_statistics,
                )
                return dygraph_utils._append_activation_in_dygraph(
                    batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
                )

            elif _in_legacy_dygraph():
                attrs = (
                    "momentum",
                    self._momentum,
                    "epsilon",
                    self._epsilon,
                    "is_test",
                    not self.training,
                    "data_layout",
                    self._data_layout,
                    "use_mkldnn",
                    self._use_mkldnn,
                    "fuse_with_relu",
                    self._fuse_with_relu,
                    "use_global_stats",
                    self._use_global_stats,
                    'trainable_statistics',
                    self._trainable_statistics,
                )
                batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm(
                    input,
                    self.weight,
                    self.bias,
                    self._mean,
                    self._variance,
                    None,
                    mean_out,
                    variance_out,
                    *attrs
                )

            return dygraph_utils._append_activation_in_dygraph(
                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
            )

        # Static graph mode: validate the input dtype and append a batch_norm
        # op to the current program.
        check_variable_and_dtype(
            input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
        )

        attrs = {
            "momentum": self._momentum,
            "epsilon": self._epsilon,
            "is_test": self._is_test,
            "data_layout": self._data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": self._fuse_with_relu,
            "use_global_stats": self._use_global_stats,
            "trainable_statistics": self._trainable_statistics,
        }

        inputs = {
            "X": [input],
            "Scale": [self.weight],
            "Bias": [self.bias],
            "Mean": [self._mean],
            "Variance": [self._variance],
        }

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True
        )
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True
        )
        reserve_space = self._helper.create_variable_for_type_inference(
            dtype=self._helper.input_dtype(input), stop_gradient=True
        )

        batch_norm_out = (
            input
            if self._in_place
            else self._helper.create_variable_for_type_inference(self._dtype)
        )

        outputs = {
            "Y": [batch_norm_out],
            "MeanOut": [mean_out],
            "VarianceOut": [variance_out],
            "SavedMean": [saved_mean],
            "SavedVariance": [saved_variance],
        }
        if reserve_space is not None:
            outputs["ReserveSpace"] = [reserve_space]

        self._helper.append_op(
            type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
        )

        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(batch_norm_out, self._act)


class RowConv(layers.Layer):
    """
    ***Row-convolution operator***

    The row convolution is also called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2:
    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf

    The main motivation is that a bidirectional RNN, useful in DeepSpeech-like speech models, learns the representation of a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
    and low-latency setting. The lookahead convolution incorporates information
    from future subsequences in a computationally efficient manner to improve
    unidirectional recurrent neural networks. The row convolution operator is
    different from the 1D sequence convolution, and is computed as follows:

    Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D,
    the output at time step i is the sum of W(j) multiplied element-wise with X(i + j), with j ranging
    over the rows of the filter (steps past the end of the sequence are treated as zero).

    For more details about row_conv, please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
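
    A minimal NumPy sketch of this lookahead computation (illustrative only,
    not the actual kernel; ``row_conv_reference`` is a hypothetical helper name):

    .. code-block:: python

        import numpy as np

        def row_conv_reference(x, w):
            # x: [T, D] input sequence, w: [future_context_size + 1, D] filter
            T, D = x.shape
            out = np.zeros_like(x)
            for i in range(T):
                for j in range(w.shape[0]):
                    if i + j < T:
                        # weight the current and future steps per dimension
                        out[i] += w[j] * x[i + j]
            return out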

    Parameters:
        name_scope(str): The name of this class.
        future_context_size (int): Future context size. Please note that the shape
            of the convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to output variable. Default: None.

    Attributes:
        weight (Parameter): the learnable weights of this layer.

    Returns:
        The output (Out) is a LoDTensor, which supports variable time-length input sequences.
        The underlying tensor in this LoDTensor is a matrix with shape T x N, i.e., the same shape as X.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              x = numpy.random.random((16)).astype('float32')
              rowConv = fluid.dygraph.nn.RowConv(
                    'RowConv', future_context_size=2)
              ret = rowConv(fluid.dygraph.base.to_variable(x))

    """

    def __init__(
        self, name_scope, future_context_size, param_attr=None, act=None
    ):
        assert (
            not _non_static_mode()
        ), "RowConv is not supported by dynamic graph mode yet!"
        super().__init__(name_scope)
        self._act = act
        self._param_attr = param_attr
        self._future_context_size = future_context_size

    def _build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._future_context_size + 1, input.shape[1]]
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            is_bias=False,
        )

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type='row_conv',
            inputs={'X': [input], 'Filter': [self.weight]},
            outputs={'Out': [out]},
        )
        return self._helper.append_activation(out, act=self._act)