# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from .. import core
from ..layers import utils
from ..layers import nn as F
from .. import dygraph_utils
from . import layers
from ..framework import (
    Variable,
    _non_static_mode,
    OpProtoHolder,
    Parameter,
    _dygraph_tracer,
    _varbase_creator,
    default_main_program,
    _global_flags,
    in_dygraph_mode,
    _in_legacy_dygraph,
)

from ..data_feeder import (
    convert_dtype,
    check_variable_and_dtype,
    check_type,
    check_dtype,
)
from ..param_attr import ParamAttr
from ..initializer import Normal, Constant, NumpyArrayInitializer
from .. import unique_name
from .layer_object_helper import LayerObjectHelper
import numpy as np
import numbers
import logging
import os
import paddle.utils.deprecated as deprecated
from paddle import _C_ops, _legacy_C_ops

__all__ = [
    'BatchNorm',
]


class BatchNorm(layers.Layer):
    r"""

    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Batch Normalization Layer and can be used
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the mean and variance of the channel based on the current batch data.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    They are calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
        //\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
        //\ mini-batch\ variance \\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. They are updated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\

    The normalization function formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift


    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift parameter

    Parameters:
        num_channels(int): Indicates the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test (bool, optional): A flag indicating whether it is in test phase or not.
             This flag only takes effect in static graph mode. For dygraph mode, please use ``eval()``.
             Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized to zero. Default: None.
        dtype(str, optional): Indicates the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format; it can be "NCHW" or "NHWC". Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of moving_mean which stores the global mean. Default: None.
        moving_variance_name(str, optional): The name of moving_variance which stores the global variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether the mean and variance parameters should be included
            in model averaging when model averaging is enabled. Default: True.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, setting use_global_stats to True
            or setting is_test to True gives equivalent behavior.
            In train mode, when use_global_stats is set to True, the global mean
            and variance are also used during training. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and variance in eval mode. In eval mode, when
            trainable_statistics is set to True, the mean and variance are calculated from the current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm(10)
              hidden1 = batch_norm(x)
    """

    def __init__(
        self,
        num_channels,
        act=None,
        is_test=False,
        momentum=0.9,
        epsilon=1e-05,
        param_attr=None,
        bias_attr=None,
        dtype='float32',
        data_layout='NCHW',
        in_place=False,
        moving_mean_name=None,
        moving_variance_name=None,
        do_model_average_for_mean_and_var=True,
        use_global_stats=False,
        trainable_statistics=False,
    ):
        super().__init__()
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

        assert (
            bias_attr is not False
        ), "bias_attr should not be False in batch_norm."

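        # Parameters are kept in float32 even when the input dtype is float16,
        # the usual practice for mixed-precision training.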
        if dtype == "float16":
            self._dtype = "float32"
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0),
        )
        self.weight.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

        self.bias = self.create_parameter(
            attr=self._bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True,
        )
        self.bias.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

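        # moving_mean and moving_variance are running statistics: stored as
        # non-trainable parameters that never receive gradients.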
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._mean.stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._variance.stop_gradient = True

        self._in_place = in_place
        self._data_layout = data_layout
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = False
        self._use_global_stats = use_global_stats
        self._trainable_statistics = trainable_statistics

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

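        # In dynamic graph mode the C++ batch_norm kernel is called directly;
        # otherwise a batch_norm op is appended to the static program further below.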
        if _non_static_mode():
            if in_dygraph_mode():
                batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
                    input,
                    self._mean,
                    self._variance,
                    self.weight,
                    self.bias,
                    not self.training,
                    self._momentum,
                    self._epsilon,
                    self._data_layout,
                    self._use_global_stats,
                    self._trainable_statistics,
                )
                return dygraph_utils._append_activation_in_dygraph(
                    batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
                )

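            # Legacy dygraph path: the old operator receives its attributes as a
            # flat name/value argument list.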
            elif _in_legacy_dygraph():
                attrs = (
                    "momentum",
                    self._momentum,
                    "epsilon",
                    self._epsilon,
                    "is_test",
                    not self.training,
                    "data_layout",
                    self._data_layout,
                    "use_mkldnn",
                    self._use_mkldnn,
                    "fuse_with_relu",
                    self._fuse_with_relu,
                    "use_global_stats",
                    self._use_global_stats,
                    'trainable_statistics',
                    self._trainable_statistics,
                )
                batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm(
                    input,
                    self.weight,
                    self.bias,
                    self._mean,
                    self._variance,
                    None,
                    mean_out,
                    variance_out,
                    *attrs
                )

            return dygraph_utils._append_activation_in_dygraph(
                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
            )

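        # Static graph mode: validate the input dtype, then append a batch_norm op
        # to the current program through the layer helper.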
        check_variable_and_dtype(
            input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
        )

        attrs = {
            "momentum": self._momentum,
            "epsilon": self._epsilon,
            "is_test": self._is_test,
            "data_layout": self._data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": self._fuse_with_relu,
            "use_global_stats": self._use_global_stats,
            "trainable_statistics": self._trainable_statistics,
        }

        inputs = {
            "X": [input],
            "Scale": [self.weight],
            "Bias": [self.bias],
            "Mean": [self._mean],
            "Variance": [self._variance],
        }

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True
        )
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True
        )
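        # ReserveSpace is an extra opaque output that some kernels (e.g. cuDNN)
        # use to speed up the backward pass.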
        reserve_space = self._helper.create_variable_for_type_inference(
            dtype=self._helper.input_dtype(input), stop_gradient=True
        )

        batch_norm_out = (
            input
            if self._in_place
            else self._helper.create_variable_for_type_inference(self._dtype)
        )

        outputs = {
            "Y": [batch_norm_out],
            "MeanOut": [mean_out],
            "VarianceOut": [variance_out],
            "SavedMean": [saved_mean],
            "SavedVariance": [saved_variance],
        }
        if reserve_space is not None:
            outputs["ReserveSpace"] = [reserve_space]

        self._helper.append_op(
            type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
        )

        # Currently, we don't support inplace in dygraph mode
        return self._helper.append_activation(batch_norm_out, self._act)


class RowConv(layers.Layer):
    """
    ***Row-convolution operator***

    The row convolution is also called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2:
    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf

    The main motivation is that a bidirectional RNN, useful in DeepSpeech-like speech models, learns a representation for a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
    and low-latency setting. The lookahead convolution incorporates information
    from future subsequences in a computationally efficient manner to improve
    unidirectional recurrent neural networks. The row convolution operator is
    different from the 1D sequence convolution, and is computed as follows:

    Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D.

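    Informally (a sketch of the lookahead computation, with rows past the end of the
    sequence treated as zero), each output row is a weighted sum of the current input
    row and the next context - 1 rows:

    ..  math::

        out_{i} = \sum_{j=0}^{context - 1} W_{j} \odot X_{i+j}
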
    For more details about row_conv, please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .

    Parameters:
        name_scope(str): The name of this class.
        future_context_size (int): Future context size. Please note, the shape
            of convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to output variable. Default: None.

    Attributes:
        weight (Parameter): the learnable weights of this layer.

    Returns:
        The output (Out) is a LoDTensor, which supports variable time-length input sequences.
        The underlying tensor in this LoDTensor is a matrix with shape T x N, i.e., the same shape as X.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              x = numpy.random.random((16)).astype('float32')
              rowConv = fluid.dygraph.nn.RowConv(
                    'RowConv', future_context_size=2)
              ret = rowConv(fluid.dygraph.base.to_variable(x))

    """

    def __init__(
        self, name_scope, future_context_size, param_attr=None, act=None
    ):
        assert (
            not _non_static_mode()
        ), "RowConv is not supported by dynamic graph mode yet!"
        super().__init__(name_scope)
        self._act = act
        self._param_attr = param_attr
        self._future_context_size = future_context_size

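    # The filter shape depends on the input feature size, so the parameter is
    # created lazily when the layer is first called.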
    def _build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._future_context_size + 1, input.shape[1]]
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            is_bias=False,
        )

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type='row_conv',
            inputs={'X': [input], 'Filter': [self.weight]},
            outputs={'Out': [out]},
        )
        return self._helper.append_activation(out, act=self._act)