fix syncbn, test=develop (#26523)

* fix syncbn, test=develop * fix test,test=develop * fix unittest,test=develop

fix syncbn, test=develop (#26523)
* fix syncbn, test=develop * fix test,test=develop * fix unittest,test=develop
f8dbd7a2 · ceci3 · GitHub · dd3df693 · f8dbd7a2 · f8dbd7a2
5 changed files
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -36,7 +36,7 @@ __all__ = [
    'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
    'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
    'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
-    'SpectralNorm', 'TreeConv', 'Flatten', 'SyncBatchNorm'
+    'SpectralNorm', 'TreeConv', 'Flatten'
 ]


@@ -3203,220 +3203,6 @@ class TreeConv(layers.Layer):
        return self._helper.append_activation(pre_activation, act=self._act)


-class SyncBatchNorm(layers.Layer):
-    """
-    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
-    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can 
-    be used as a normalizer function for other operations, such as conv2d and fully connected 
-    operations.
-    The data is normalized by the mean and variance of the channel based on whole mini-batch
-    , which including data in all gpus.
-    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
-    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
-    for more details.
-
-    When model in training mode, the :math:`\\mu_{\\beta}` 
-    and :math:`\\sigma_{\\beta}^{2}` are the statistics of whole mini-batch data in all gpus.
-    Calculated as follows:
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-
-    - :math:`x` : whole mini-batch data in all gpus
-    - :math:`m` : the size of the whole mini-batch data
-
-    When model in evaluation mode, the :math:`\\mu_{\\beta}`
-    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, 
-    which usually got from the pre-trained model). Global statistics calculated as follows:
-
-    .. math::
-        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
-
-    The formula of normalization is as follows:
- 
-    ..  math::
-
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-    - :math:`\\eps` : add a smaller value to the variance to prevent division by zero
-    - :math:`\\gamma` : trainable scale parameter vector
-    - :math:`\\beta` : trainable shift parameter vector 
-
-    Parameters:
-        num_features(int): Indicate the number of channels of the input ``Tensor``.
-        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
-        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
-             of this layer. If it is set to None or one attribute of ParamAttr, this layerr
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. If it is set to False, 
-             this layer will not have trainable scale parameter. Default: None.
-        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
-             If it is set to None or one attribute of ParamAttr, this layer
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. If it is set to False, this layer will not 
-             have trainable bias parameter. Default: None.
-        track_running_stats(bool, optional): Whether to compute global stats, which including running mean and 
-             running variance. Default: True.
-
-    Returns:
-        None
-
-    Examples:
-        .. code-block:: python
-
-          import paddle
-          import paddle.nn as nn
-          import numpy as np
-
-          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
-          paddle.disable_static()
-          x = paddle.to_tensor(x)
-          if paddle.fluid.is_compiled_with_cuda():
-              sync_batch_norm = nn.SyncBatchNorm(2)
-              hidden1 = sync_batch_norm(x)
-              print(hidden1.numpy())
-              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
-    """
-
-    def __init__(self,
-                 num_features,
-                 epsilon=1e-05,
-                 momentum=0.9,
-                 track_running_stats=True,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCHW',
-                 name=None):
-        super(SyncBatchNorm, self).__init__()
-        self._weight_attr = weight_attr
-        self._bias_attr = bias_attr
-        self._num_features = num_features
-        self._data_layout = data_format
-        self._momentum = momentum
-        self._epsilon = epsilon
-        self._track_running_stats = track_running_stats
-
-        if self._track_running_stats == False:
-            logging.warn(
-                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
-            )
-
-        param_shape = [self._num_features]
-
-        # create parameter
-        if weight_attr == False:
-            self.weight = self.create_parameter(
-                attr=None, shape=param_shape, default_initializer=Constant(1.0))
-            self.weight.stop_gradient = True
-        else:
-            self.weight = self.create_parameter(
-                attr=self._weight_attr,
-                shape=param_shape,
-                default_initializer=Constant(1.0))
-            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        if bias_attr == False:
-            self.bias = self.create_parameter(
-                attr=None,
-                shape=param_shape,
-                default_initializer=Constant(0.0),
-                is_bias=True)
-            self.bias.stop_gradient = True
-        else:
-            self.bias = self.create_parameter(
-                attr=self._bias_attr, shape=param_shape, is_bias=True)
-            self.bias.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
-
-        self._mean = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(0.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._mean.stop_gradient = True
-
-        self._variance = self.create_parameter(
-            attr=ParamAttr(
-                name=None,
-                initializer=Constant(1.0),
-                trainable=False,
-                do_model_average=True),
-            shape=param_shape,
-            dtype=self._dtype)
-        self._variance.stop_gradient = True
-
-    def forward(self, x):
-        # create output
-        # mean and mean_out share the same memory
-        mean_out = self._mean
-        # variance and variance out share the same memory
-        variance_out = self._variance
-
-        ### train mode: use mini-batch stats, eval mode: use global stats
-        if in_dygraph_mode():
-            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
-                     "is_test", not self.training, "data_layout",
-                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
-                     False, "use_global_stats", not self.training,
-                     'trainable_statistics', False)
-            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
-                x, self.weight, self.bias, self._mean, self._variance, mean_out,
-                variance_out, *attrs)
-
-            return sync_batch_norm_out
-
-        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'BatchNorm')
-
-        attrs = {
-            "momentum": self._momentum,
-            "epsilon": self._epsilon,
-            "is_test": not self.training,
-            "data_layout": self._data_layout,
-            "use_mkldnn": False,
-            "fuse_with_relu": False,
-            "use_global_stats": not self.training,
-            "trainable_statistics": False,
-        }
-
-        inputs = {
-            "X": [x],
-            "Scale": [self.weight],
-            "Bias": [self.bias],
-            "Mean": [self._mean],
-            "Variance": [self._variance]
-        }
-
-        saved_mean = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        saved_variance = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
-        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
-
-        outputs = {
-            "Y": [sync_batch_norm_out],
-            "MeanOut": [mean_out],
-            "VarianceOut": [variance_out],
-            "SavedMean": [saved_mean],
-            "SavedVariance": [saved_variance]
-        }
-
-        self._helper.append_op(
-            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
-        return sync_batch_norm_out
-
-
 class Flatten(layers.Layer):
    """
    :alias_main: paddle.nn.Flatten

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -287,7 +287,7 @@ class TestLayer(LayerTest):
        if core.is_compiled_with_cuda():
            with self.static_graph():
                t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
-                my_sync_bn = nn.SyncBatchNorm(3)
+                my_sync_bn = paddle.nn.SyncBatchNorm(3)
                ret = my_sync_bn(t)
                static_ret = self.get_static_graph_result(
                    feed={'t': np.ones(

--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sync_batch_norm.py
@@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase):
    def _setup_config(self):
        self._sync_mode = False
        self._nccl2_mode = True
-        self._dygraph = False  #True
+        self._dygraph = True

    def test_mnist(self):
        if fluid.core.is_compiled_with_cuda():

--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -22,6 +22,7 @@ import unittest
 import numpy as np
 import os
 import six
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
@@ -209,7 +210,7 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
            return

        with program_guard(Program(), Program()):
-            my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10)
+            my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
            x1 = fluid.create_lod_tensor(
                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
            self.assertRaises(TypeError, my_sync_batch_norm, x1)

--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -14,15 +14,239 @@

 # TODO: define normalization api  

+import warnings
 from ...fluid.dygraph.nn import InstanceNorm

 from ...fluid.dygraph import BatchNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import GroupNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import LayerNorm  #DEFINE_ALIAS
 from ...fluid.dygraph import SpectralNorm  #DEFINE_ALIAS
-from ...fluid.dygraph import SyncBatchNorm  #DEFINE_ALIAS
+
+from ...fluid.dygraph import layers
+from ...fluid.framework import in_dygraph_mode
+
+from ...fluid.initializer import Constant
+from ...fluid.param_attr import ParamAttr
+from ...fluid.data_feeder import check_variable_and_dtype, check_type
+from ...fluid import core

 __all__ = [
    'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
    'SyncBatchNorm'
 ]
+
+
+class SyncBatchNorm(layers.Layer):
+    """
+    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
+    It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
+    be used as a normalizer function for other operations, such as conv2d and fully connected
+    operations.
+    The data is normalized by the mean and variance of the channel based on the whole mini-batch,
+    which includes the data on all GPUs.
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    When the model is in training mode, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are the statistics of the whole mini-batch data on all GPUs.
+    Calculated as follows:
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \\ mini-batch\\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\\ mini-batch\\ variance \\\\
+
+    - :math:`x` : whole mini-batch data on all GPUs
+    - :math:`m` : the size of the whole mini-batch data
+
+    When the model is in evaluation mode, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance,
+    which are usually obtained from a pre-trained model). Global statistics are calculated as follows:
+
+    ..  math::
+
+        moving\\_mean = moving\\_mean * momentum + \\mu_{\\beta} * (1. - momentum) \\quad &// global mean \\\\
+        moving\\_variance = moving\\_variance * momentum + \\sigma_{\\beta}^{2} * (1. - momentum) \\quad &// global variance
+
+    The formula of normalization is as follows:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\\ scale\\ and\\ shift
+
+    - :math:`\\eps` : a small value added to the variance to prevent division by zero
+    - :math:`\\gamma` : trainable scale parameter vector
+    - :math:`\\beta` : trainable shift parameter vector
+
+    Parameters:
+        num_features(int): Indicate the number of channels of the input ``Tensor``.
+        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
+        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
+        track_running_stats(bool, optional): Whether to compute global stats, which include running mean and
+             running variance. NOTE: in this version the moving statistics are computed regardless of
+             this flag; passing False only emits a warning. Default: True.
+        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
+             of this layer. If it is set to None or one attribute of ParamAttr, this layer
+             will create ParamAttr as weight_attr. If the Initializer of the weight_attr
+             is not set, the parameter is initialized with the constant 1.0. If it is set to False,
+             the scale parameter is still created (constant 1.0) but is not trainable. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of this layer.
+             If it is set to None or one attribute of ParamAttr, this layer
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. If it is set to False, the bias parameter
+             is still created (constant 0.0) but is not trainable. Default: None.
+        data_format(str, optional): Forwarded to the ``sync_batch_norm`` op as its ``data_layout``
+             attribute. Default: 'NCHW'.
+        name(str, optional): Accepted for API compatibility; it is not used by this class. Default: None.
+
+    Shapes:
+        input: Tensor that the dimension from 2 to 5.
+        output: Tensor with the same shape as input.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import paddle.nn as nn
+          import numpy as np
+
+          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
+          paddle.disable_static()
+          x = paddle.to_tensor(x)
+          if paddle.fluid.is_compiled_with_cuda():
+              sync_batch_norm = nn.SyncBatchNorm(2)
+              hidden1 = sync_batch_norm(x)
+              print(hidden1.numpy())
+              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
+    """
+
+    def __init__(self,
+                 num_features,
+                 epsilon=1e-05,
+                 momentum=0.9,
+                 track_running_stats=True,
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW',
+                 name=None):
+        """Create the scale/bias parameters and the moving mean/variance buffers."""
+        super(SyncBatchNorm, self).__init__()
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._num_features = num_features
+        self._data_layout = data_format
+        self._momentum = momentum
+        self._epsilon = epsilon
+        self._track_running_stats = track_running_stats
+
+        # The flag is stored but does not change the computation yet, so the
+        # caller is told explicitly instead of being silently ignored.
+        if self._track_running_stats == False:
+            warnings.warn(
+                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
+            )
+
+        # One value per channel for scale, bias, moving mean and moving variance.
+        param_shape = [self._num_features]
+
+        # create parameter
+        if weight_attr == False:
+            # Scale disabled: keep a constant, non-trainable scale of 1.0.
+            self.weight = self.create_parameter(
+                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+            self.weight.stop_gradient = True
+        else:
+            self.weight = self.create_parameter(
+                attr=self._weight_attr,
+                shape=param_shape,
+                default_initializer=Constant(1.0))
+            # Freeze the scale only when its attribute asks for a zero learning
+            # rate. getattr keeps None (and attrs without `learning_rate`)
+            # from raising AttributeError.
+            self.weight.stop_gradient = getattr(
+                self._weight_attr, 'learning_rate', None) == 0.
+
+        if bias_attr == False:
+            # Bias disabled: keep a constant, non-trainable bias of 0.0.
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+            self.bias.stop_gradient = True
+        else:
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True)
+            # Must be driven by the bias attribute (previously this read
+            # `_weight_attr`, so the bias followed the weight's learning rate).
+            self.bias.stop_gradient = getattr(
+                self._bias_attr, 'learning_rate', None) == 0.
+
+        # Moving statistics: created as non-trainable parameters, never
+        # updated through gradients.
+        self._mean = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._mean.stop_gradient = True
+
+        self._variance = self.create_parameter(
+            attr=ParamAttr(
+                name=None,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True),
+            shape=param_shape,
+            dtype=self._dtype)
+        self._variance.stop_gradient = True
+
+    def forward(self, x):
+        """Apply the ``sync_batch_norm`` op to ``x`` and return the normalized output.
+
+        In dygraph mode the op is called directly; otherwise the op is
+        appended to the static graph through the layer helper.
+        """
+        # create output
+        # mean and mean_out share the same memory
+        mean_out = self._mean
+        # variance and variance out share the same memory
+        variance_out = self._variance
+
+        ### train mode: use mini-batch stats, eval mode: use global stats
+        ### use_global_stats only support False in sync_batch_norm
+        if in_dygraph_mode():
+            # Flat (name, value, name, value, ...) attribute tuple.
+            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
+                     "is_test", not self.training, "data_layout",
+                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     False, "use_global_stats", False, 'trainable_statistics',
+                     False)
+            # Only the first of the six op outputs is returned.
+            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
+                x, self.weight, self.bias, self._mean, self._variance, mean_out,
+                variance_out, *attrs)
+
+            return sync_batch_norm_out
+
+        # Static graph: validate the input first; the op name passed here is
+        # what the raised error reports, so it names this layer.
+        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
+                                 'SyncBatchNorm')
+
+        attrs = {
+            "momentum": self._momentum,
+            "epsilon": self._epsilon,
+            "is_test": not self.training,
+            "data_layout": self._data_layout,
+            "use_mkldnn": False,
+            "fuse_with_relu": False,
+            "use_global_stats": False,
+            "trainable_statistics": False,
+        }
+
+        inputs = {
+            "X": [x],
+            "Scale": [self.weight],
+            "Bias": [self.bias],
+            "Mean": [self._mean],
+            "Variance": [self._variance]
+        }
+
+        saved_mean = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        saved_variance = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        outputs = {
+            "Y": [sync_batch_norm_out],
+            "MeanOut": [mean_out],
+            "VarianceOut": [variance_out],
+            "SavedMean": [saved_mean],
+            "SavedVariance": [saved_variance]
+        }
+
+        self._helper.append_op(
+            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
+        return sync_batch_norm_out