# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from .. import core from ..layers import utils from ..layers import nn as F from .. import dygraph_utils from . import layers from ..framework import ( Variable, _non_static_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program, _global_flags, in_dygraph_mode, _in_legacy_dygraph, ) from ..data_feeder import ( convert_dtype, check_variable_and_dtype, check_type, check_dtype, ) from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer from .. import unique_name from .layer_object_helper import LayerObjectHelper from ..data_feeder import check_variable_and_dtype, check_type import numpy as np import numbers import logging import os import paddle.utils.deprecated as deprecated from paddle import _C_ops, _legacy_C_ops __all__ = [ 'BatchNorm', ] class BatchNorm(layers.Layer): r""" This interface is used to construct a callable object of the ``BatchNorm`` class. For more details, refer to code examples. It implements the function of the Batch Normalization Layer and can be used as a normalizer function for conv2d and fully connected operations. The data is normalized by the mean and variance of the channel based on the current batch data. Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift `_ for more details. When use_global_stats = False, the :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & //\ mini-batch\ mean \\ \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & //\ mini-batch\ variance \\ - :math:`x` : mini-batch data - :math:`m` : the size of the mini-batch data When use_global_stats = True, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ The normalization function formula is as follows: .. math:: \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero - :math:`\gamma` : trainable proportional parameter - :math:`\beta` : trainable deviation parameter Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. act(str, optional): Activation to be applied to the output of batch normalization. Default: None. is_test (bool, optional): A flag indicating whether it is in test phrase or not. This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``. Default: False. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. dtype(str, optional): Indicate the data type of the input ``Tensor``, which can be float32 or float64. Default: float32. data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. do_model_average_for_mean_and_var(bool, optional): Whether parameter mean and variance should do model average when model average is enabled. Default: True. use_global_stats(bool, optional): Whether to use global mean and variance. In inference or test mode, set use_global_stats to true or is_test to true, and the behavior is equivalent. In train mode, when setting use_global_stats True, the global mean and variance are also used during train period. Default: False. trainable_statistics(bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when setting trainable_statistics True, mean and variance will be calculated by current batch statistics. Default: False. Returns: None Examples: .. code-block:: python import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable import numpy as np x = np.random.random(size=(3, 10, 3, 7)).astype('float32') with fluid.dygraph.guard(): x = to_variable(x) batch_norm = fluid.BatchNorm(10) hidden1 = batch_norm(x) """ def __init__( self, num_channels, act=None, is_test=False, momentum=0.9, epsilon=1e-05, param_attr=None, bias_attr=None, dtype='float32', data_layout='NCHW', in_place=False, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=True, use_global_stats=False, trainable_statistics=False, ): super().__init__() self._param_attr = param_attr self._bias_attr = bias_attr self._act = act self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] assert ( bias_attr is not False ), "bias_attr should not be False in batch_norm." if dtype == "float16": self._dtype = "float32" else: self._dtype = dtype param_shape = [num_channels] # create parameter self.weight = self.create_parameter( attr=self._param_attr, shape=param_shape, dtype=self._dtype, default_initializer=Constant(1.0), ) self.weight.stop_gradient = ( use_global_stats and self._param_attr.learning_rate == 0.0 ) self.bias = self.create_parameter( attr=self._bias_attr, shape=param_shape, dtype=self._dtype, is_bias=True, ) self.bias.stop_gradient = ( use_global_stats and self._param_attr.learning_rate == 0.0 ) self._mean = self.create_parameter( attr=ParamAttr( name=moving_mean_name, initializer=Constant(0.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), shape=param_shape, dtype=self._dtype, ) self._mean.stop_gradient = True self._variance = self.create_parameter( attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), trainable=False, do_model_average=do_model_average_for_mean_and_var, ), shape=param_shape, dtype=self._dtype, ) self._variance.stop_gradient = True self._in_place = in_place self._data_layout = data_layout self._momentum = momentum self._epsilon = epsilon self._is_test = is_test self._fuse_with_relu = False self._use_global_stats = use_global_stats self._trainable_statistics = trainable_statistics def forward(self, input): # create output # mean and mean_out share the same memory mean_out = self._mean # variance and variance out share the same memory variance_out = self._variance if _non_static_mode(): if in_dygraph_mode(): batch_norm_out, t1, t2, t3, t4, _ = _C_ops.batch_norm( input, self._mean, self._variance, self.weight, self.bias, not self.training, self._momentum, self._epsilon, self._data_layout, self._use_global_stats, self._trainable_statistics, ) return dygraph_utils._append_activation_in_dygraph( batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn ) elif _in_legacy_dygraph(): attrs = ( "momentum", self._momentum, "epsilon", self._epsilon, "is_test", not self.training, "data_layout", self._data_layout, "use_mkldnn", self._use_mkldnn, "fuse_with_relu", self._fuse_with_relu, "use_global_stats", self._use_global_stats, 'trainable_statistics', self._trainable_statistics, ) batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( input, self.weight, self.bias, self._mean, self._variance, None, mean_out, variance_out, *attrs ) return dygraph_utils._append_activation_in_dygraph( batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn ) check_variable_and_dtype( input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' ) attrs = { "momentum": self._momentum, "epsilon": self._epsilon, "is_test": self._is_test, "data_layout": self._data_layout, "use_mkldnn": False, "fuse_with_relu": self._fuse_with_relu, "use_global_stats": self._use_global_stats, "trainable_statistics": self._trainable_statistics, } inputs = { "X": [input], "Scale": [self.weight], "Bias": [self.bias], "Mean": [self._mean], "Variance": [self._variance], } saved_mean = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True ) saved_variance = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True ) reserve_space = self._helper.create_variable_for_type_inference( dtype=self._helper.input_dtype(input), stop_gradient=True ) batch_norm_out = ( input if self._in_place else self._helper.create_variable_for_type_inference(self._dtype) ) outputs = { "Y": [batch_norm_out], "MeanOut": [mean_out], "VarianceOut": [variance_out], "SavedMean": [saved_mean], "SavedVariance": [saved_variance], } if reserve_space is not None: outputs["ReserveSpace"] = [reserve_space] self._helper.append_op( type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs ) # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(batch_norm_out, self._act) class RowConv(layers.Layer): """ ***Row-convolution operator*** The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a forward and a backward pass through the entire sequence. However, unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online and low-latency setting. The lookahead convolution incorporates information from future subsequences in a computationally efficient manner to improve unidirectional recurrent neural networks. The row convolution operator is different from the 1D sequence convolution, and is computed as follows: Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D. More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 . Parameters: name_scope(str): The name of this class. future_context_size (int): Future context size. Please note, the shape of convolution kernel is [future_context_size + 1, D]. param_attr (ParamAttr): Attributes of parameters, including name, initializer etc. Default: None. act (str): Non-linear activation to be applied to output variable. Default: None. Attributes: weight (Parameter): the learnable weights of this layer. Returns: the output(Out) is a LodTensor, which supports variable time-length input sequences. The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X. Examples: .. code-block:: python import paddle.fluid as fluid import numpy with fluid.dygraph.guard(): x = numpy.random.random((16)).astype('float32') rowConv = fluid.dygraph.nn.RowConv( 'RowConv', future_context_size=2) ret = rowConv(fluid.dygraph.base.to_variable(x)) """ def __init__( self, name_scope, future_context_size, param_attr=None, act=None ): assert ( not _non_static_mode() ), "RowConv is not supported by dynamic graph mode yet!" super().__init__(name_scope) self._act = act self._param_attr = param_attr self._future_context_size = future_context_size def _build_once(self, input): self._dtype = self._helper.input_dtype(input) filter_shape = [self._future_context_size + 1, input.shape[1]] self.weight = self.create_parameter( attr=self._param_attr, shape=filter_shape, dtype=self._dtype, is_bias=False, ) def forward(self, input): out = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type='row_conv', inputs={'X': [input], 'Filter': [self.weight]}, outputs={'Out': [out]}, ) return self._helper.append_activation(out, act=self._act)