Unverified commit 6af7b42b authored by W wangzhen38, committed by GitHub

[remove fluid] drop_out API (#48586)

* [remove fluid] drop_out API

* [remove fluid] drop_out API

* [remove fluid] drop_out layernorm

* [remove fluid] drop_out layernorm

* [remove fluid] drop_out layernorm

* [remove fluid] drop_out layernorm
Parent 23299c70
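The diff below drops the `Dropout` and `LayerNorm` layers from `paddle.fluid.dygraph.nn` and points callers at their `paddle.nn` counterparts. A minimal migration sketch (tensor shapes and variable names are illustrative, not taken from the diff):

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.random.random((3, 32, 32)).astype('float32'))

# fluid.dygraph.Dropout(p=0.5)  ->  paddle.nn.Dropout(p=0.5)
drop = paddle.nn.Dropout(p=0.5)
y_train = drop(x)     # training: units are dropped at random
drop.eval()           # dropout is a no-op in eval mode
y_eval = drop(x)

# fluid.dygraph.LayerNorm([32, 32])  ->  paddle.nn.LayerNorm([32, 32])
ln = paddle.nn.LayerNorm([32, 32])
z = ln(x)
```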
@@ -53,10 +53,8 @@ __all__ = [
'Pool2D',
'Linear',
'BatchNorm',
'Dropout',
'Embedding',
'GRUUnit',
'LayerNorm',
'NCE',
'PRelu',
'BilinearTensorProduct',
@@ -1184,124 +1182,6 @@ class BatchNorm(layers.Layer):
return self._helper.append_activation(batch_norm_out, self._act)
class Dropout(layers.Layer):
"""
This interface is used to construct a callable object of the ``Dropout`` class.
For more details, refer to code examples.
Drop or keep each element of the input independently. Dropout is a regularization
technique for reducing overfitting by preventing neuron co-adaptation during
training. The dropout operator randomly sets (according to the given dropout
probability) the outputs of some units to zero, while the others remain
unchanged.
At inference time the dropout layer can be removed for efficiency.
Parameters:
p (float, optional): Probability of setting units to zero. Default: 0.5
seed (int, optional): A Python integer used to create random seeds. If this
parameter is set to None, a random seed is used.
NOTE: If an integer seed is given, the same output units will
always be dropped. DO NOT use a fixed seed in training. Default: None.
dropout_implementation(string, optional): ['downgrade_in_infer'(default)|'upscale_in_train']
1. downgrade_in_infer(default), downgrade the outcome at inference
- train: out = input * mask
- inference: out = input * (1.0 - p)
(mask is a tensor with the same shape as the input; its values are 0 or 1,
and the ratio of 0s is p)
2. upscale_in_train, upscale the outcome at training time
- train: out = input * mask / ( 1.0 - p )
- inference: out = input
(mask is a tensor with the same shape as the input; its values are 0 or 1,
and the ratio of 0s is p)
is_test (bool, optional): A flag indicating whether it is in the test phase or not.
This flag only has effect on static graph mode. For dygraph mode, please use ``eval()``.
Default: False.
Returns:
None
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import numpy as np
x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
with fluid.dygraph.guard():
x = to_variable(x)
m = fluid.dygraph.Dropout(p=0.5)
droped_train = m(x)
# switch to eval mode
m.eval()
droped_eval = m(x)
"""
def __init__(
self,
p=0.5,
seed=None,
dropout_implementation="downgrade_in_infer",
is_test=False,
):
super().__init__()
assert isinstance(p, (float, int)), "p argument should be a number"
assert 0 <= p <= 1, "p argument should be between 0 and 1"
self._dropout_prob = p
assert seed is None or isinstance(
seed, int
), "seed argument should be None or a integer"
self._seed = seed
assert dropout_implementation in (
'downgrade_in_infer',
'upscale_in_train',
), "dropout_implementation argument should be 'downgrade_in_infer' or 'upscale_in_train'"
self._dropout_implementation = dropout_implementation
self._is_test = is_test
def forward(self, input):
# fast return for p == 0
if self._dropout_prob == 0:
return input
prog = default_main_program()
if (self._seed is None or self._seed == 0) and prog.random_seed != 0:
self._seed = prog.random_seed
attrs = {
'dropout_prob': self._dropout_prob,
'is_test': not self.training
if _non_static_mode()
else self._is_test,
'fix_seed': self._seed is not None,
'seed': self._seed if self._seed is not None else 0,
'dropout_implementation': self._dropout_implementation,
}
if _non_static_mode():
attrs = sum(attrs.items(), ())
out, mask = _legacy_C_ops.dropout(input, *attrs)
return out
out = self._helper.create_variable_for_type_inference(dtype=input.dtype)
mask = self._helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
)
self._helper.append_op(
type='dropout',
inputs={'X': [input]},
outputs={'Out': [out], 'Mask': [mask]},
attrs=attrs,
)
return out
class Embedding(layers.Layer):
r"""
:alias_main: paddle.nn.Embedding
@@ -1483,214 +1363,6 @@ class Embedding(layers.Layer):
return out
class LayerNorm(layers.Layer):
r"""
:alias_main: paddle.nn.LayerNorm
:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
:old_api: paddle.fluid.dygraph.LayerNorm
This interface is used to construct a callable object of the ``LayerNorm`` class.
For more details, refer to code examples.
It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
The formula is as follows:
.. math::
\\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i
\\sigma & = \\sqrt{\\frac{1}{H}\\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}
y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)
- :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
- :math:`H`: the number of hidden units in a layer
- :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
- :math:`g`: the trainable scale parameter.
- :math:`b`: the trainable bias parameter.
Parameters:
normalized_shape(int or list or tuple): Input shape from an expected input of
size :math:`[*, normalized_shape[0], normalized_shape[1], ..., normalized_shape[-1]]`.
If it is a single integer, this module will normalize over the last dimension
which is expected to be of that specific size.
scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
normalization. Default: True.
shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
normalization. Default: True.
epsilon(float, optional): The small value added to the variance to prevent
division by zero. Default: 1e-05.
param_attr(ParamAttr, optional): The parameter attribute for the learnable
gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
a default :code:`ParamAttr` would be added as scale. The
:attr:`param_attr` is initialized as 1 if it is added. Default: None.
bias_attr(ParamAttr, optional): The parameter attribute for the learnable
bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
omitted. If :attr:`shift` is True and :attr:`bias_attr` is None,
a default :code:`ParamAttr` would be added as bias. The
:attr:`bias_attr` is initialized as 0 if it is added. Default: None.
act(str, optional): Activation to be applied to the output of layer normalization.
Default: None.
dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
Returns:
None
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import numpy
x = numpy.random.random((3, 32, 32)).astype('float32')
with fluid.dygraph.guard():
x = to_variable(x)
layerNorm = fluid.LayerNorm([32, 32])
ret = layerNorm(x)
"""
def __init__(
self,
normalized_shape,
scale=True,
shift=True,
epsilon=1e-05,
param_attr=None,
bias_attr=None,
act=None,
dtype='float32',
):
super().__init__()
if isinstance(normalized_shape, numbers.Integral):
normalized_shape = [normalized_shape]
self._normalized_shape = list(normalized_shape)
self._scale = scale
self._shift = shift
self._epsilon = epsilon
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
self._dtype = dtype
param_shape = [np.prod(self._normalized_shape)]
if self._scale:
self.weight = self.create_parameter(
attr=self._param_attr,
shape=param_shape,
dtype=self._dtype,
default_initializer=Constant(1.0),
)
else:
if self._param_attr:
logging.warn("param_attr are only available with scale is True")
self.weight = None
if self._shift:
assert self._bias_attr is not False
self.bias = self.create_parameter(
attr=self._bias_attr,
shape=param_shape,
dtype=self._dtype,
is_bias=True,
)
else:
if self._bias_attr:
logging.warn("bias_attr are only available with shift is True")
self.bias = None
def forward(self, input):
input_shape = list(input.shape)
input_ndim = len(input_shape)
normalized_ndim = len(self._normalized_shape)
self._begin_norm_axis = input_ndim - normalized_ndim
if (
input_ndim < normalized_ndim
or input_shape[self._begin_norm_axis :] != self._normalized_shape
):
str_normalized_shape = str(self._normalized_shape)
raise ValueError(
'Given normalized_shape is '
+ str_normalized_shape
+ ', expected input with shape [*, '
+ str_normalized_shape[1:]
+ ', but got input shape '
+ str(input_shape)
)
if _non_static_mode():
if in_dygraph_mode():
pre_act, _, _ = _C_ops.layer_norm(
input,
self.weight,
self.bias,
self._epsilon,
self._begin_norm_axis,
)
return dygraph_utils._append_activation_in_dygraph(
pre_act, act=self._act
)
else:
pre_act, _, _ = _legacy_C_ops.layer_norm(
input,
self.weight,
self.bias,
'epsilon',
self._epsilon,
'begin_norm_axis',
self._begin_norm_axis,
)
return dygraph_utils._append_activation_in_dygraph(
pre_act, act=self._act
)
check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'LayerNorm'
)
inputs = dict()
inputs['X'] = [input]
if self._scale:
inputs['Scale'] = [self.weight]
if self._shift:
inputs['Bias'] = [self.bias]
attrs = {
"epsilon": self._epsilon,
"begin_norm_axis": self._begin_norm_axis,
}
# create output
mean_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True
)
variance_out = self._helper.create_variable_for_type_inference(
dtype=self._dtype, stop_gradient=True
)
layer_norm_out = self._helper.create_variable_for_type_inference(
self._dtype
)
self._helper.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": layer_norm_out,
"Mean": mean_out,
"Variance": variance_out,
},
attrs={
"epsilon": self._epsilon,
"begin_norm_axis": self._begin_norm_axis,
},
)
return self._helper.append_activation(layer_norm_out, act=self._act)
class GRUUnit(layers.Layer):
"""
**GRU unit layer**
......
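The removed class exposed `seed` and `dropout_implementation` arguments. A hedged sketch of how they map onto `paddle.nn.Dropout`, assuming its `mode` argument covers the same two conventions described in the removed docstring:

```python
import paddle

x = paddle.rand([3, 10, 3, 7])

# The removed dropout_implementation argument maps onto the mode argument of
# paddle.nn.Dropout: 'upscale_in_train' (the new default) and
# 'downgrade_in_infer' (the old default) are both accepted.
m = paddle.nn.Dropout(p=0.5, mode='downgrade_in_infer')
out_train = m(x)      # train: out = x * mask
m.eval()
out_infer = m(x)      # infer: out = x * (1 - p)

# There is no per-layer seed argument any more; randomness is controlled
# globally, e.g. via paddle.seed(1).
```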
@@ -18,13 +18,7 @@ from test_dist_base import TestParallelDyGraphRunnerBase, runtime_main
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid.dygraph import (
Embedding,
Layer,
LayerNorm,
Linear,
to_variable,
)
from paddle.fluid.dygraph import Embedding, Layer, Linear, to_variable
from paddle.optimizer.lr import NoamDecay
"""
@@ -245,9 +239,9 @@ class PrePostProcessLayer(Layer):
super().__init__()
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
self._layer_norm = paddle.nn.LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
weight_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(
......
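The hunk above renames `param_attr` to `weight_attr` when switching from the fluid layer to `paddle.nn.LayerNorm`. A minimal sketch of the new construction; `d_model` and the bias initializer value are assumptions, since both are elided in the hunk:

```python
import paddle
import paddle.fluid as fluid

d_model = 512  # illustrative size, not taken from the diff
layer_norm = paddle.nn.LayerNorm(
    normalized_shape=d_model,
    # param_attr in the fluid layer becomes weight_attr here
    weight_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Constant(1.0)
    ),
    bias_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Constant(0.0)  # assumed value; elided in the hunk
    ),
)
```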
@@ -18,7 +18,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.nn.functional as F
from paddle.fluid.dygraph import Embedding, Layer, LayerNorm, to_variable
from paddle.fluid.dygraph import Embedding, Layer, to_variable
from paddle.fluid.layers.utils import map_structure
from paddle.jit.api import dygraph_to_static_func
from paddle.nn import Linear
@@ -59,9 +59,9 @@ class PrePostProcessLayer(Layer):
self.add_sublayer(
"layer_norm_%d"
% len([layer for layer in self.children()]),
LayerNorm(
paddle.nn.LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
weight_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(
......
@@ -286,7 +286,7 @@ class TestDygraphLayerNormAPIError(unittest.TestCase):
with program_guard(Program(), Program()):
paddle.enable_static()
layer_norm = fluid.LayerNorm([32, 32])
layer_norm = paddle.nn.LayerNorm([32, 32])
# the input of LayerNorm must be Variable.
x1 = np.random.random((3, 32, 32)).astype('float32')
self.assertRaises(TypeError, layer_norm, x1)
......
@@ -538,7 +538,7 @@ class TestDropoutFAPI(unittest.TestCase):
res10 = paddle.nn.functional.dropout(
x=input, p=1.0, training=True
)
dropout = paddle.fluid.dygraph.Dropout(
dropout = paddle.nn.Dropout(
p=0,
)
res11 = dropout(input)
......
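The test above pairs `paddle.nn.functional.dropout(p=1.0, training=True)` with `paddle.nn.Dropout(p=0)`. A small sketch of the two boundary cases it exercises (names are illustrative):

```python
import numpy as np
import paddle
import paddle.nn.functional as F

x = paddle.ones([2, 3], dtype='float32')

# p=0 drops nothing, so the layer is the identity even in training mode.
drop0 = paddle.nn.Dropout(p=0)
np.testing.assert_allclose(drop0(x).numpy(), x.numpy())

# p=1.0 with training=True zeroes every unit, which is what res10 checks above.
y = F.dropout(x, p=1.0, training=True)
np.testing.assert_allclose(y.numpy(), np.zeros([2, 3], dtype='float32'))
```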
@@ -21,14 +21,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
from paddle.fluid.dygraph.nn import (
NCE,
BatchNorm,
Embedding,
GroupNorm,
LayerNorm,
PRelu,
)
from paddle.fluid.dygraph.nn import NCE, BatchNorm, Embedding, GroupNorm, PRelu
from paddle.nn import Linear
@@ -212,8 +205,8 @@ class TestDygraphLoadStatic(unittest.TestCase):
self.emb1 = Embedding([1000, 100])
self.emb2 = Embedding([2000, 200])
self.layer_norm_1 = LayerNorm([10])
self.layer_norm_2 = LayerNorm(10)
self.layer_norm_1 = paddle.nn.LayerNorm([10])
self.layer_norm_2 = paddle.nn.LayerNorm(10)
self.nce1 = NCE(10000, 100)
self.nce2 = NCE(10000, 100)
......
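The hunk above constructs `paddle.nn.LayerNorm([10])` and `paddle.nn.LayerNorm(10)` side by side. A short sketch showing that an int and a one-element list describe the same normalization, under the assumption that both layers keep their default constant initialization:

```python
import numpy as np
import paddle

x = paddle.rand([4, 10])
ln_list = paddle.nn.LayerNorm([10])  # normalized_shape given as a list
ln_int = paddle.nn.LayerNorm(10)     # normalized_shape given as a plain int
# Both default to weight=1 and bias=0, so the outputs should coincide.
np.testing.assert_allclose(ln_list(x).numpy(), ln_int(x).numpy(), rtol=1e-5)
```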
@@ -20,7 +20,7 @@ from test_imperative_base import new_program_scope
import paddle
import paddle.fluid as fluid
import paddle.nn.functional as F
from paddle.fluid import Embedding, Layer, LayerNorm, core
from paddle.fluid import Embedding, Layer, core
from paddle.fluid.dygraph import guard, to_variable
from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard
from paddle.jit import TracedLayer
@@ -399,9 +399,9 @@ class PrePostProcessLayer(Layer):
super().__init__()
for cmd in process_cmd:
if cmd == "n":
self._layer_norm = LayerNorm(
self._layer_norm = paddle.nn.LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
weight_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(
......
@@ -375,7 +375,7 @@ class TestDygraphLayerNormAPIError(unittest.TestCase):
with program_guard(Program(), Program()):
paddle.enable_static()
layer_norm = fluid.LayerNorm([32, 32])
layer_norm = paddle.nn.LayerNorm([32, 32])
# the input of LayerNorm must be Variable.
x1 = np.random.random((3, 32, 32)).astype('float32')
self.assertRaises(TypeError, layer_norm, x1)
......
@@ -33,7 +33,7 @@ class TestDygraphLayerNormv2(unittest.TestCase):
def compute_v1(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[1:])
ln = paddle.nn.LayerNorm(shape[1:])
y = ln(paddle.to_tensor(x))
return y.numpy()
@@ -57,7 +57,7 @@ class TestDygraphLayerNormv2(unittest.TestCase):
def compute_v1(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[1:])
ln = paddle.nn.LayerNorm(shape[1:])
x1 = paddle.to_tensor(x)
x1.stop_gradient = False
y = ln(x1)
@@ -91,7 +91,7 @@ class TestDygraphLayerNormv2(unittest.TestCase):
def compute_v1(x_np):
with program_guard(Program(), Program()):
ln = fluid.dygraph.LayerNorm(shape[1:])
ln = paddle.nn.LayerNorm(shape[1:])
x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
y = ln(x)
exe.run(fluid.default_startup_program())
@@ -123,7 +123,7 @@ class TestLayerNormFunction(unittest.TestCase):
def compute_v0(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[1:])
ln = paddle.nn.LayerNorm(shape[1:])
y = ln(paddle.to_tensor(x))
return y.numpy()
@@ -141,7 +141,7 @@ class TestLayerNormFunction(unittest.TestCase):
def compute_v3(x):
with fluid.dygraph.guard(p):
ln = fluid.dygraph.LayerNorm(shape[-1])
ln = paddle.nn.LayerNorm(shape[-1])
y = ln(paddle.to_tensor(x))
return y.numpy()
......
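These hunks swap the fluid layer for `paddle.nn.LayerNorm` inside tests that compare the class and functional forms. A minimal equivalence sketch against `paddle.nn.functional.layer_norm` (shapes are illustrative):

```python
import numpy as np
import paddle
import paddle.nn.functional as F

x = paddle.rand([2, 3, 4, 5])
ln = paddle.nn.LayerNorm(x.shape[1:])   # normalize over the last three dims
y_layer = ln(x)
y_func = F.layer_norm(x, normalized_shape=x.shape[1:],
                      weight=ln.weight, bias=ln.bias)
np.testing.assert_allclose(y_layer.numpy(), y_func.numpy(), rtol=1e-5)
```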
@@ -120,50 +120,6 @@ class TestLayer(LayerTest):
ret = custom(x, do_linear2=True)
np.testing.assert_array_equal(ret.numpy().shape, [3, 1])
def test_dropout(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False,
)
dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
ret = dropout(t)
ret2 = fluid.layers.dropout(
t, dropout_prob=0.35, seed=1, is_test=False
)
static_ret, static_ret2 = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret, ret2]
)
with self.dynamic_graph():
with _test_eager_guard():
t = base.to_variable(inp)
dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
dy_eager_ret = dropout(t)
dy_eager_ret2 = fluid.layers.dropout(
t, dropout_prob=0.35, seed=1, is_test=False
)
dy_eager_ret_value = dy_eager_ret.numpy()
dy_eager_ret2_value = dy_eager_ret2.numpy()
t = base.to_variable(inp)
dropout = nn.Dropout(p=0.35, seed=1, is_test=False)
dy_ret = dropout(t)
dy_ret2 = fluid.layers.dropout(
t, dropout_prob=0.35, seed=1, is_test=False
)
dy_ret_value = dy_ret.numpy()
dy_ret2_value = dy_ret2.numpy()
np.testing.assert_array_equal(dy_eager_ret_value, dy_eager_ret2_value)
np.testing.assert_array_equal(static_ret, dy_eager_ret_value)
np.testing.assert_array_equal(static_ret, static_ret2)
np.testing.assert_array_equal(dy_ret_value, dy_ret2_value)
np.testing.assert_array_equal(static_ret, dy_ret_value)
def test_linear(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
@@ -284,107 +240,6 @@ class TestLayer(LayerTest):
self.assertRaises(TypeError, test_type)
def test_layer_norm(self):
inp = np.ones([3, 32, 32], dtype='float32')
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False,
)
ret = layers.layer_norm(
t,
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
static_ret = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret]
)[0]
with self.static_graph():
t = layers.data(
name='data',
shape=[3, 32, 32],
dtype='float32',
append_batch_size=False,
)
lm = nn.LayerNorm(
normalized_shape=[32, 32],
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
ret = lm(t)
static_ret2 = self.get_static_graph_result(
feed={'data': inp}, fetch_list=[ret]
)[0]
with self.dynamic_graph():
with _test_eager_guard():
lm = nn.LayerNorm(
normalized_shape=[32, 32],
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
dy_eager_ret = lm(base.to_variable(inp))
dy_eager_ret_value = dy_eager_ret.numpy()
lm = nn.LayerNorm(
normalized_shape=[32, 32],
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
dy_ret = lm(base.to_variable(inp))
dy_ret_value = dy_ret.numpy()
with self.dynamic_graph():
with _test_eager_guard():
lm = nn.LayerNorm(
normalized_shape=[32, 32],
shift=False,
scale=False,
param_attr=fluid.initializer.ConstantInitializer(value=1),
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
lm(base.to_variable(inp))
self.assertFalse(hasattr(lm, "_scale_w"))
self.assertFalse(hasattr(lm, "_bias_w"))
lm = nn.LayerNorm(
normalized_shape=[32, 32],
shift=False,
scale=False,
param_attr=fluid.initializer.ConstantInitializer(value=1),
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
lm(base.to_variable(inp))
self.assertFalse(hasattr(lm, "_scale_w"))
self.assertFalse(hasattr(lm, "_bias_w"))
np.testing.assert_array_equal(static_ret, static_ret2)
np.testing.assert_array_equal(dy_eager_ret_value, static_ret2)
np.testing.assert_array_equal(dy_ret_value, static_ret2)
with self.dynamic_graph():
with _test_eager_guard():
lm = nn.LayerNorm(
normalized_shape=[16, 32],
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
with self.assertRaises(ValueError):
lm(base.to_variable(inp))
lm = nn.LayerNorm(
normalized_shape=[16, 32],
bias_attr=fluid.initializer.ConstantInitializer(value=1),
act='sigmoid',
)
with self.assertRaises(ValueError):
lm(base.to_variable(inp))
def test_SyncBatchNorm(self):
if core.is_compiled_with_cuda():
with self.static_graph():
......
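The removed `test_dropout` and `test_layer_norm` cases checked the fluid layers against their functional forms. A hedged sketch of comparable checks written against the surviving `paddle.nn` API, offered only as an illustration rather than code added by this commit:

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.ones([3, 32, 32], dtype='float32'))

# In eval mode the default 'upscale_in_train' dropout is exactly the identity.
drop = paddle.nn.Dropout(p=0.35)
drop.eval()
np.testing.assert_allclose(drop(x).numpy(), x.numpy())

# For an all-ones input, LayerNorm over the last two dims is all zeros
# (zero variance, so the normalized value is 0 before scale and shift).
ln = paddle.nn.LayerNorm([32, 32])
np.testing.assert_allclose(ln(x).numpy(), np.zeros([3, 32, 32], dtype='float32'),
                           atol=1e-6)
```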
@@ -13,9 +13,11
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import paddle
import paddle.fluid as fluid
class TestModelAverage(unittest.TestCase):
......