# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from .. import core
from ..layers import utils
from ..layers import nn as F
from .. import dygraph_utils
from . import layers
from ..framework import (
    Variable,
    OpProtoHolder,
    Parameter,
    _dygraph_tracer,
    _varbase_creator,
    default_main_program,
    _global_flags,
    in_dygraph_mode,
)

from ..data_feeder import (
    convert_dtype,
    check_variable_and_dtype,
    check_type,
    check_dtype,
)

from ..param_attr import ParamAttr
from ..initializer import Normal, Constant, NumpyArrayInitializer
from .. import unique_name
from .layer_object_helper import LayerObjectHelper
import numpy as np
import numbers
import logging
import os
import paddle.utils.deprecated as deprecated
from paddle import _C_ops, _legacy_C_ops

__all__ = []


class BatchNorm(layers.Layer):
    r"""

    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to code examples.
    It implements the Batch Normalization layer and can be used
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the per-channel mean and variance computed from the current mini-batch.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    They are calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
        //\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
        //\ mini-batch\ variance \\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global (running) statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. They are updated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &//\ global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &//\ global\ variance \\

    The normalization formula is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift


    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift parameter
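
    A minimal NumPy sketch of the training-mode computation described above
    (illustrative only; ``batch_norm_numpy`` is a hypothetical helper, not part
    of this API):

    ..  code-block:: python

        import numpy as np

        def batch_norm_numpy(x, gamma, beta, moving_mean, moving_var,
                             momentum=0.9, eps=1e-5):
            # x: [N, C, H, W]; statistics are computed per channel
            mu = x.mean(axis=(0, 2, 3))
            var = x.var(axis=(0, 2, 3))
            # update the running statistics
            moving_mean = moving_mean * momentum + mu * (1.0 - momentum)
            moving_var = moving_var * momentum + var * (1.0 - momentum)
            # normalize, then scale and shift
            x_hat = (x - mu.reshape(1, -1, 1, 1)) / np.sqrt(
                var.reshape(1, -1, 1, 1) + eps)
            y = gamma.reshape(1, -1, 1, 1) * x_hat + beta.reshape(1, -1, 1, 1)
            return y, moving_mean, moving_var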

    Parameters:
        num_channels(int): Indicates the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test (bool, optional): A flag indicating whether it is in test phase or not.
             This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``.
             Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
             is not set, the bias is initialized zero. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
             which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format, which can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of channels (feature maps), `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of the moving_mean which stores the global mean. Default: None.
        moving_variance_name(str, optional): The name of the moving_variance which stores the global variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether the mean and variance should be included in
            model averaging when model averaging is enabled. Default: True.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, setting use_global_stats to True
            or setting is_test to True is equivalent.
            In train mode, when use_global_stats is set to True, the global mean
            and variance are also used during training. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and variance in eval mode. In eval mode, when
            trainable_statistics is set to True, mean and variance will be calculated from the current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle
          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable

          x = paddle.rand([3, 10, 3, 7], 'float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm(10)
              hidden1 = batch_norm(x)
    """

    def __init__(
        self,
        num_channels,
        act=None,
        is_test=False,
        momentum=0.9,
        epsilon=1e-05,
        param_attr=None,
        bias_attr=None,
        dtype='float32',
        data_layout='NCHW',
        in_place=False,
        moving_mean_name=None,
        moving_variance_name=None,
        do_model_average_for_mean_and_var=True,
        use_global_stats=False,
        trainable_statistics=False,
    ):
        super().__init__()
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
        self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

        assert (
            bias_attr is not False
        ), "bias_attr should not be False in batch_norm."

        # Parameters and statistics are kept in float32 even when the input
        # dtype is float16, for numerical stability.
        if dtype == "float16":
            self._dtype = "float32"
        else:
            self._dtype = dtype

        param_shape = [num_channels]

        # create parameter
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0),
        )
        self.weight.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

        self.bias = self.create_parameter(
            attr=self._bias_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True,
        )
        self.bias.stop_gradient = (
            use_global_stats and self._param_attr.learning_rate == 0.0
        )

        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._mean.stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var,
            ),
            shape=param_shape,
            dtype=self._dtype,
        )
        self._variance.stop_gradient = True

        self._in_place = in_place
        self._data_layout = data_layout
        self._momentum = momentum
        self._epsilon = epsilon
        self._is_test = is_test
        self._fuse_with_relu = False
        self._use_global_stats = use_global_stats
        self._trainable_statistics = trainable_statistics

    def forward(self, input):
        # create output
        # mean and mean_out share the same memory
        mean_out = self._mean
        # variance and variance out share the same memory
        variance_out = self._variance

        if in_dygraph_mode():
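            # Dygraph mode: invoke the batch_norm kernel through _C_ops directly.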
            batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm(
                input,
                self._mean,
                self._variance,
                self.weight,
                self.bias,
                not self.training,
                self._momentum,
                self._epsilon,
                self._data_layout,
                self._use_global_stats,
                self._trainable_statistics,
            )
            return dygraph_utils._append_activation_in_dygraph(
                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn
            )
        else:
            check_variable_and_dtype(
                input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
            )
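            # Static graph mode: assemble the batch_norm op manually through the
            # layer helper, wiring up its attributes, inputs and output variables.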

            attrs = {
                "momentum": self._momentum,
                "epsilon": self._epsilon,
                "is_test": self._is_test,
                "data_layout": self._data_layout,
                "use_mkldnn": False,
                "fuse_with_relu": self._fuse_with_relu,
                "use_global_stats": self._use_global_stats,
                "trainable_statistics": self._trainable_statistics,
            }

            inputs = {
                "X": [input],
                "Scale": [self.weight],
                "Bias": [self.bias],
                "Mean": [self._mean],
                "Variance": [self._variance],
            }

            saved_mean = self._helper.create_variable_for_type_inference(
                dtype=self._dtype, stop_gradient=True
            )
            saved_variance = self._helper.create_variable_for_type_inference(
                dtype=self._dtype, stop_gradient=True
            )
            reserve_space = self._helper.create_variable_for_type_inference(
                dtype=self._helper.input_dtype(input), stop_gradient=True
            )

            batch_norm_out = (
                input
                if self._in_place
                else self._helper.create_variable_for_type_inference(
                    self._dtype
                )
            )

            outputs = {
                "Y": [batch_norm_out],
                "MeanOut": [mean_out],
                "VarianceOut": [variance_out],
                "SavedMean": [saved_mean],
                "SavedVariance": [saved_variance],
            }
            if reserve_space is not None:
                outputs["ReserveSpace"] = [reserve_space]

            self._helper.append_op(
                type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
            )

            # Currently, we don't support inplace in dygraph mode
            return self._helper.append_activation(batch_norm_out, self._act)


class RowConv(layers.Layer):
    """
    ***Row-convolution operator***

    The row convolution is called lookahead convolution.  This operator was introduced in the following paper for DeepSpeech2:
    http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf

    The main motivation is that a bidirectional RNN, useful in DeepSpeech-like speech models, learns representations for a sequence by performing a
    forward and a backward pass through the entire sequence. However, unlike
    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
    and low-latency setting. The lookahead convolution incorporates information
    from future subsequences in a computationally efficient manner to improve
    unidirectional recurrent neural networks. The row convolution operator is
    different from the 1D sequence convolution, and is computed as follows:

    Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D, the output at each time step is a column-wise weighted sum of the current row of X and the following context - 1 rows (see the sketch below).

    For more details about row_conv, please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
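
    A rough NumPy sketch of this lookahead computation (an illustration based on
    the description above; the ``row_conv_numpy`` helper is hypothetical and not
    part of this API):

    ..  code-block:: python

        import numpy as np

        def row_conv_numpy(x, w):
            # x: [T, D] input sequence, w: [context, D] lookahead filter
            T, D = x.shape
            context = w.shape[0]
            out = np.zeros_like(x)
            for i in range(T):
                for j in range(context):
                    if i + j < T:
                        # weight the row j steps ahead by the j-th filter row
                        out[i] += x[i + j] * w[j]
            return out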

    Parameters:
        name_scope(str): The name of this class.
        future_context_size (int): Future context size. Please note, the shape
            of convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc. Default: None.
        act (str): Non-linear activation to be applied to output variable. Default: None.

    Attributes:
        weight (Parameter): the learnable weights of this layer.

    Returns:
        the output(Out) is a LoDTensor, which supports variable time-length input sequences.
        The underlying tensor in this LoDTensor is a matrix with shape T x N, i.e., the same shape as X.

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy

          with fluid.dygraph.guard():
              x = numpy.random.random((16, 32)).astype('float32')  # [T, D]
              rowConv = fluid.dygraph.nn.RowConv(
                    'RowConv', future_context_size=2)
              ret = rowConv(fluid.dygraph.base.to_variable(x))

    """

    def __init__(
        self, name_scope, future_context_size, param_attr=None, act=None
    ):
        assert (
            not in_dygraph_mode()
        ), "RowConv is not supported by dynamic graph mode yet!"
        super().__init__(name_scope)
        self._act = act
        self._param_attr = param_attr
        self._future_context_size = future_context_size

    def _build_once(self, input):
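        # The filter shape depends on the input feature size, so the weight is
        # created lazily here, on the first call of the layer.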
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._future_context_size + 1, input.shape[1]]
        self.weight = self.create_parameter(
            attr=self._param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            is_bias=False,
        )

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type='row_conv',
            inputs={'X': [input], 'Filter': [self.weight]},
            outputs={'Out': [out]},
        )
        return self._helper.append_activation(out, act=self._act)