diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 69d27e2c2341b87895b6eba8d83ff2cb3819eec8..dc3403358b6af25d5da001282fffe53be8bfd3d9 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -36,7 +36,7 @@ __all__ = [
     'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
     'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
     'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
-    'SpectralNorm', 'TreeConv', 'Flatten', 'SyncBatchNorm'
+    'SpectralNorm', 'TreeConv', 'Flatten'
 ]
@@ -3203,220 +3203,6 @@ class TreeConv(layers.Layer):
         return self._helper.append_activation(pre_activation, act=self._act)
 
 
-class SyncBatchNorm(layers.Layer):
-    """
-    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
-    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
-    be used as a normalizer function for other operations, such as conv2d and fully connected
-    operations.
-    The data is normalized by the mean and variance of the channel based on whole mini-batch
-    , which including data in all gpus.
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
-    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_
-    for more details.
-
-    When model in training mode, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
-    Calculated as follows:
-
-    .. math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-
-    - :math:`x` : whole mini-batch data in all gpus
-    - :math:`m` : the size of the whole mini-batch data
-
-    When model in evaluation mode, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance,
-    which usually got from the pre-trained model). Global statistics calculated as follows:
-
-    .. math::
-        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
-
-    The formula of normalization is as follows:
-
-    .. math::
-
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-    - :math:`\\eps` : add a smaller value to the variance to prevent division by zero
-    - :math:`\\gamma` : trainable scale parameter vector
-    - :math:`\\beta` : trainable shift parameter vector
-
-    Parameters:
-        num_features(int): Indicate the number of channels of the input ``Tensor``.
-        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
-        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
-            of this layer. If it is set to None or one attribute of ParamAttr, this layerr
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. If it is set to False,
-            this layer will not have trainable scale parameter. Default: None.
-        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
-            If it is set to None or one attribute of ParamAttr, this layer
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. If it is set to False, this layer will not
-            have trainable bias parameter. Default: None.
-        track_running_stats(bool, optional): Whether to compute global stats, which including running mean and
-            running variance. Default: True.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-          import paddle
-          import paddle.nn as nn
-          import numpy as np
-
-          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
-          paddle.disable_static()
-          x = paddle.to_tensor(x)
-          if paddle.fluid.is_compiled_with_cuda():
-              sync_batch_norm = nn.SyncBatchNorm(2)
-              hidden1 = sync_batch_norm(x)
-              print(hidden1.numpy())
-              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
-    """
-
-    def __init__(self,
-                 num_features,
-                 epsilon=1e-05,
-                 momentum=0.9,
-                 track_running_stats=True,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCHW',
-                 name=None):
-        super(SyncBatchNorm, self).__init__()
-        self._weight_attr = weight_attr
-        self._bias_attr = bias_attr
-        self._num_features = num_features
-        self._data_layout = data_format
-        self._momentum = momentum
-        self._epsilon = epsilon
-        self._track_running_stats = track_running_stats
-
-        if self._track_running_stats == False:
-            logging.warn(
-                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
-            )
-
-        param_shape = [self._num_features]
-
-        # create parameter
-        if weight_attr == False:
-            self.weight = self.create_parameter(
-                attr=None, shape=param_shape, default_initializer=Constant(1.0))
-            self.weight.stop_gradient = True
-        else:
-            self.weight = self.create_parameter(
-                attr=self._weight_attr,
-                shape=param_shape,
-                default_initializer=Constant(1.0))
-            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        if bias_attr == False:
-            self.bias = self.create_parameter(
-                attr=None,
-                shape=param_shape,
-                default_initializer=Constant(0.0),
-                is_bias=True)
-            self.bias.stop_gradient = True
-        else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr, shape=param_shape, is_bias=True)
-            self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._mean.stop_gradient = True
-
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._variance.stop_gradient = True
-
-    def forward(self, x):
-        # create output
-        # mean and mean_out share the same memory
-        mean_out = self._mean
-        # variance and variance out share the same memory
-        variance_out = self._variance
-
-        ### train mode: use mini-batch stats, eval mode: use global stats
-        if in_dygraph_mode():
-            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
-                     "is_test", not self.training, "data_layout",
-                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
-                     False, "use_global_stats", not self.training,
-                     'trainable_statistics', False)
-            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
-                x, self.weight, self.bias, self._mean, self._variance, mean_out,
-                variance_out, *attrs)
-
-            return sync_batch_norm_out
-
-        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'BatchNorm')
-
-        attrs = {
-            "momentum": self._momentum,
-            "epsilon": self._epsilon,
-            "is_test": not self.training,
-            "data_layout": self._data_layout,
-            "use_mkldnn": False,
-            "fuse_with_relu": False,
-            "use_global_stats": not self.training,
-            "trainable_statistics": False,
-        }
-
-        inputs = {
-            "X": [x],
-            "Scale": [self.weight],
-            "Bias": [self.bias],
-            "Mean": [self._mean],
-            "Variance": [self._variance]
-        }
-
-        saved_mean = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        saved_variance = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
-
-        outputs = {
-            "Y": [sync_batch_norm_out],
-            "MeanOut": [mean_out],
-            "VarianceOut": [variance_out],
-            "SavedMean": [saved_mean],
-            "SavedVariance": [saved_variance]
-        }
-
-        self._helper.append_op(
-            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
-        return sync_batch_norm_out
-
-
 class Flatten(layers.Layer):
     """
     :alias_main: paddle.nn.Flatten
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 4ce7bd693f3deb2b62fe4abef8e35aca56c42532..938f33dd71ee93067a5f67709002344ac3bb0c0f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -287,7 +287,7 @@ class TestLayer(LayerTest):
         if core.is_compiled_with_cuda():
             with self.static_graph():
                 t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
-                my_sync_bn = nn.SyncBatchNorm(3)
+                my_sync_bn = paddle.nn.SyncBatchNorm(3)
                 ret = my_sync_bn(t)
                 static_ret = self.get_static_graph_result(
                     feed={'t': np.ones(
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
index 5c34b35fc83a3de6e1a33a51dad1e4e264afd52b..7cf1e9711b74b31e15b732f87addbc9fa653152f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
@@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase):
     def _setup_config(self):
         self._sync_mode = False
         self._nccl2_mode = True
-        self._dygraph = False  #True
+        self._dygraph = True
 
     def test_mnist(self):
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
index 806b6b90e7e2d39a3d4e5f3792cd849022097777..b0701a9b187f6c7cf63f43d69f482ea13e6d3fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -22,6 +22,7 @@ import unittest
 import numpy as np
 import os
 import six
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
@@ -209,7 +210,7 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
             return
 
         with program_guard(Program(), Program()):
-            my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10)
+            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
             x1 = fluid.create_lod_tensor(
                 np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
             self.assertRaises(TypeError, my_sync_batch_norm, x1)
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 1d00f9c7b8b0204affed690bcea2f0ff78a943d1..369d462a8089a30e6b749ef472aad66166cb590d 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -14,15 +14,239 @@
 
 # TODO: define normalization api
 
+import warnings
 from ...fluid.dygraph.nn import InstanceNorm
 from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import SpectralNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import SyncBatchNorm  #DEFINE_ALIAS
+
+from ...fluid.dygraph import layers
+from ...fluid.framework import in_dygraph_mode
+
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid import core
 
 __all__ = [
     'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
     'SyncBatchNorm'
 ]
+
+
+class SyncBatchNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
+    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
+    be used as a normalizer function for other operations, such as conv2d and fully connected
+    operations.
+    The data is normalized by the mean and variance of the channel computed over the whole
+    mini-batch, which includes the data on all GPUs.
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`_
+    for more details.
+
+    When the model is in training mode, :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of the whole mini-batch data across all GPUs,
+    calculated as follows:
+
+    .. math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+
+    - :math:`x` : whole mini-batch data across all GPUs
+    - :math:`m` : the size of the whole mini-batch data
+
+    When the model is in evaluation mode, :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance,
+    usually obtained from a pre-trained model). The global statistics are calculated as follows:
+
+    .. math::
+        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
+        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+
+    The formula of normalization is as follows:
+
+    .. math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    - :math:`\\epsilon` : a small value added to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter vector
+    - :math:`\\beta` : trainable shift parameter vector
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for the Parameter `scale`
+            of this layer. If it is set to None or one attribute of ParamAttr, this layer
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. If it is set to False,
+            this layer will not have a trainable scale parameter. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
+            If it is set to None or one attribute of ParamAttr, this layer
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized to zero. If it is set to False, this layer will not
+            have a trainable bias parameter. Default: None.
+        track_running_stats(bool, optional): Whether to compute global stats, which include the running mean
+            and running variance. Default: True.
+
+    Shapes:
+        input: Tensor with 2 to 5 dimensions.
+        output: Tensor with the same shape as the input.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(x)
+          if paddle.fluid.is_compiled_with_cuda():
+              sync_batch_norm = nn.SyncBatchNorm(2)
+              hidden1 = sync_batch_norm(x)
+              print(hidden1.numpy())
+              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-05,
+                 momentum=0.9,
+                 track_running_stats=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        super(SyncBatchNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._num_features = num_features
+        self._data_layout = data_format
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._track_running_stats = track_running_stats
+
+        if self._track_running_stats == False:
+            warnings.warn(
+                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
+            )
+
+        param_shape = [self._num_features]
+
+        # create parameter
+        if weight_attr == False:
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            self.weight.stop_gradient = self._weight_attr is not None and self._weight_attr.learning_rate == 0.
+
+        if bias_attr == False:
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            self.bias.stop_gradient = self._bias_attr is not None and self._bias_attr.learning_rate == 0.
+
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+    def forward(self, x):
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance_out share the same memory
+        variance_out = self._variance
+
+        ### train mode: use mini-batch stats, eval mode: use global stats
+        ### use_global_stats only supports False in sync_batch_norm
+        if in_dygraph_mode():
+            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
+                     "is_test", not self.training, "data_layout",
+                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     False, "use_global_stats", False, 'trainable_statistics',
+                     False)
+            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance, mean_out,
+                variance_out, *attrs)
+
+            return sync_batch_norm_out
+
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'BatchNorm')
+
+        attrs = {
+            "momentum": self._momentum,
+            "epsilon": self._epsilon,
+            "is_test": not self.training,
+            "data_layout": self._data_layout,
+            "use_mkldnn": False,
+            "fuse_with_relu": False,
+            "use_global_stats": False,
+            "trainable_statistics": False,
+        }
+
+        inputs = {
+            "X": [x],
+            "Scale": [self.weight],
+            "Bias": [self.bias],
+            "Mean": [self._mean],
+            "Variance": [self._variance]
+        }
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        outputs = {
+            "Y": [sync_batch_norm_out],
+            "MeanOut": [mean_out],
+            "VarianceOut": [variance_out],
+            "SavedMean": [saved_mean],
+            "SavedVariance": [saved_variance]
+        }
+
+        self._helper.append_op(
+            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        return sync_batch_norm_out
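
A minimal NumPy sketch of the statistics the new docstring describes: the per-channel mini-batch mean and variance, the training-mode normalization (with gamma=1, beta=0, matching the default initializers above), and the moving-average update that evaluation mode normalizes with. It is not part of the patch and does not exercise the sync_batch_norm op itself; it assumes NCHW layout, and the array contents are illustrative only.

.. code-block:: python

    import numpy as np

    np.random.seed(0)
    x = np.random.rand(4, 2, 3, 3).astype('float32')  # mini-batch gathered across all GPUs
    epsilon, momentum = 1e-5, 0.9

    # Per-channel mini-batch statistics, reduced over the N, H, W axes.
    mu = x.mean(axis=(0, 2, 3))   # mini-batch mean
    var = x.var(axis=(0, 2, 3))   # mini-batch variance

    # Training-mode normalization with gamma=1 and beta=0.
    x_hat = (x - mu.reshape(1, -1, 1, 1)) / np.sqrt(var.reshape(1, -1, 1, 1) + epsilon)
    print(x_hat.mean(axis=(0, 2, 3)))  # ~0 per channel
    print(x_hat.std(axis=(0, 2, 3)))   # ~1 per channel

    # Moving-statistics update applied after each training step; these play the
    # role of self._mean / self._variance, with the same initial values.
    moving_mean = np.zeros(2, dtype='float32')
    moving_variance = np.ones(2, dtype='float32')
    moving_mean = moving_mean * momentum + mu * (1. - momentum)
    moving_variance = moving_variance * momentum + var * (1. - momentum)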