Commit 9ca417f1 authored by zhongpu, committed by Jiabin Yang

fix APIs, update norm op, test=develop (#20119)

* update norm op, test=develop, test=document_fix

* fix norm api, test=develop, test=document_fix
Parent 3833b511
@@ -189,7 +189,7 @@ paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'pa
paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a6477957b44907787b3c74157400b80c'))
paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '678de6d6d0c93da74189990b039daae8'))
paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '87dd4b818f102bc1a780e1804c28bd38'))
paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '7b3d14d6707d878923847ec617d7d521'))
paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '54e1675aa0364f4a78fa72804ec0f413'))
@@ -660,7 +660,7 @@ paddle.fluid.dygraph.FC.set_dict (ArgSpec(args=['self', 'stat_dict', 'include_su
paddle.fluid.dygraph.FC.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
paddle.fluid.dygraph.FC.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.FC.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.BatchNorm ('paddle.fluid.dygraph.nn.BatchNorm', ('document', 'f26599d75e3eba36c5dd3224a33009d8'))
paddle.fluid.dygraph.BatchNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'num_channels', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'dtype', 'data_layout', 'in_place', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats', 'trainable_statistics'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'float32', 'NCHW', False, None, None, False, False, False, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.BatchNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
paddle.fluid.dygraph.BatchNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -711,7 +711,7 @@ paddle.fluid.dygraph.GRUUnit.set_dict (ArgSpec(args=['self', 'stat_dict', 'inclu
paddle.fluid.dygraph.GRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
paddle.fluid.dygraph.GRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.GRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.LayerNorm ('paddle.fluid.dygraph.nn.LayerNorm', ('document', '0d4e428afdc5a3c989ec3270967c3263'))
paddle.fluid.dygraph.LayerNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.LayerNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
paddle.fluid.dygraph.LayerNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -813,7 +813,7 @@ paddle.fluid.dygraph.Conv3DTranspose.set_dict (ArgSpec(args=['self', 'stat_dict'
paddle.fluid.dygraph.Conv3DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
paddle.fluid.dygraph.Conv3DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.Conv3DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.GroupNorm ('paddle.fluid.dygraph.nn.GroupNorm', ('document', 'fb75d41f9f6aa895557caf5315d876cc'))
paddle.fluid.dygraph.GroupNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.GroupNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
paddle.fluid.dygraph.GroupNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -830,7 +830,7 @@ paddle.fluid.dygraph.GroupNorm.set_dict (ArgSpec(args=['self', 'stat_dict', 'inc
paddle.fluid.dygraph.GroupNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
paddle.fluid.dygraph.GroupNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
paddle.fluid.dygraph.GroupNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.SpectralNorm ('paddle.fluid.dygraph.nn.SpectralNorm', ('document', '20a09e11c24d6a96fbb98bce3800bebb'))
paddle.fluid.dygraph.SpectralNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.dygraph.SpectralNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
paddle.fluid.dygraph.SpectralNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
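Each `('document', '<hex>')` field in the spec lines above is a 32-character digest of the corresponding API's docstring, which is why a documentation-only commit like this one surfaces as hash changes in the spec file. A minimal sketch of how such an entry could be produced, assuming the digest is a plain md5 of the docstring text (the exact normalization Paddle's spec generator applies may differ):

.. code-block:: python

    import hashlib
    import inspect

    def spec_entry(api, qualified_name):
        # Signature part of the entry; Python 2's inspect.getargspec
        # printed as ArgSpec(...), matching the spec file above.
        argspec = inspect.getfullargspec(api)
        # Assumption: the 'document' field is the md5 of the docstring.
        doc = inspect.getdoc(api) or 'None'
        digest = hashlib.md5(doc.encode('utf-8')).hexdigest()
        return "%s (%s, ('document', '%s'))" % (qualified_name, argspec, digest)

    def example(x, axis=1):
        """Toy API used to demonstrate the entry format."""
        return x

    print(spec_entry(example, 'mymodule.example'))

With Paddle installed, one could pass `paddle.fluid.layers.layer_norm` and its qualified name instead of the toy function to reproduce a line like the ones above.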
......
@@ -1049,20 +1049,18 @@ class FC(layers.Layer):
class BatchNorm(layers.Layer):
    """
    This interface is used to construct a callable object of the ``BatchNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Batch Normalization Layer and can be used
    as a normalizer function for conv2d and fully connected operations.
    The data is normalized by the mean and variance of the channel based on the current batch data.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When use_global_stats = False, the :math:`\\mu_{\\beta}`
    and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:
@@ -1070,70 +1068,79 @@ class BatchNorm(layers.Layer):

    .. math::

        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
        \ mini-batch\ mean \\\\
        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\

    - :math:`x` : mini-batch data
    - :math:`m` : the size of the mini-batch data

    When use_global_stats = True, the :math:`\\mu_{\\beta}`
    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. Calculated as follows:

    .. math::

        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\

    The normalization function formula is as follows:

    .. math::

        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift

    - :math:`\\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\\gamma` : trainable scale parameter
    - :math:`\\beta` : trainable shift parameter

    Parameters:
        name_scope(str): The name of this class.
        num_channels(int): Indicate the number of channels of the input ``Tensor``.
        act(str, optional): Activation to be applied to the output of batch normalization. Default: None.
        is_test(bool, optional): A flag indicating whether it is in test phase or not. Default: False.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        dtype(str, optional): Indicate the data type of the input ``Tensor``,
            which can be float32 or float64. Default: float32.
        data_layout(str, optional): Specify the input data format, which can be "NCHW" or "NHWC". Default: NCHW.
        in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False.
        moving_mean_name(str, optional): The name of moving_mean which stores the global mean. Default: None.
        moving_variance_name(str, optional): The name of moving_variance which stores the global variance. Default: None.
        do_model_average_for_mean_and_var(bool, optional): Whether to do model average for mean and variance. Default: False.
        fuse_with_relu(bool, optional): When set to True, this OP performs relu after batch norm. Default: False.
        use_global_stats(bool, optional): Whether to use global mean and
            variance. In inference or test mode, setting either use_global_stats
            or is_test to True is equivalent.
            In train mode, when use_global_stats is True, the global mean
            and variance are also used during training. Default: False.
        trainable_statistics(bool, optional): Whether to calculate mean and variance in eval mode. In eval mode, when
            trainable_statistics is True, mean and variance will be calculated by current batch statistics.
            Default: False.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy as np

          x = np.random.random(size=(3, 10, 3, 7)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              batch_norm = fluid.BatchNorm("batch_norm", 10)
              hidden1 = batch_norm(x)
    """
    def __init__(self,
@@ -1363,70 +1370,66 @@ class Embedding(layers.Layer):
class LayerNorm(layers.Layer):
    """
    This interface is used to construct a callable object of the ``LayerNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_

    The formula is as follows:

    .. math::

        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i

        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}

        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)

    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
    - :math:`H`: the number of hidden units in a layer
    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
    - :math:`g`: the trainable scale parameter.
    - :math:`b`: the trainable bias parameter.

    Parameters:
        name_scope(str): The name of this class.
        scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
            normalization. Default: True.
        shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
            normalization. Default: True.
        begin_norm_axis(int, optional): The normalization will be performed along
            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
            Default: 1.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
            a default :code:`ParamAttr` would be added as scale. The
            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
            omitted. If :attr:`shift` is True and :attr:`bias_attr` is None,
            a default :code:`ParamAttr` would be added as bias. The
            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
        act(str, optional): Activation to be applied to the output of layer normalization.
            Default: None.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          from paddle.fluid.dygraph.base import to_variable
          import numpy

          x = numpy.random.random((3, 32, 32)).astype('float32')
          with fluid.dygraph.guard():
              x = to_variable(x)
              layerNorm = fluid.LayerNorm('LayerNorm', begin_norm_axis=1)
              ret = layerNorm(x)
    """
@@ -2562,37 +2565,38 @@ class RowConv(layers.Layer):
class GroupNorm(layers.Layer):
    """
    This interface is used to construct a callable object of the ``GroupNorm`` class.
    For more details, refer to code examples.
    It implements the function of the Group Normalization Layer.
    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .

    Parameters:
        name_scope(str): The name of this class.
        groups(int): The number of groups into which the channels are divided.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
            scale :math:`g`. If it is set to False, no scale will be added to the output units.
            If it is set to None, the scale is initialized as one. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
            bias :math:`b`. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized as zero. Default: None.
        act(str, optional): Activation to be applied to the output of group normalization. Default: None.
        data_layout(str, optional): Specify the input data format. Only NCHW is supported. Default: NCHW.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy as np

          with fluid.dygraph.guard():
              x = np.random.random((8, 32, 32)).astype('float32')
              groupNorm = fluid.dygraph.nn.GroupNorm('GroupNorm', groups=4)
              ret = groupNorm(fluid.dygraph.base.to_variable(x))
    """
@@ -2661,8 +2665,8 @@ class GroupNorm(layers.Layer):
class SpectralNorm(layers.Layer):
    """
    This interface is used to construct a callable object of the ``SpectralNorm`` class.
    For more details, refer to code examples. It implements the function of the Spectral Normalization Layer.
    This layer calculates the spectral normalization value of the weight parameters of
    fc, conv1d, conv2d, and conv3d layers, which should be 2-D, 3-D, 4-D, or 5-D
    Parameters. Calculations are shown as follows.
@@ -2696,22 +2700,22 @@ class SpectralNorm(layers.Layer):
    Parameters:
        name_scope(str): The name of this class.
        dim(int, optional): The index of the dimension which should be permuted to the first before reshaping Input(Weight) to a matrix; it should be set to 0 if Input(Weight) is the weight of an fc layer, and to 1 if Input(Weight) is the weight of a conv layer. Default: 0.
        power_iters(int, optional): The number of power iterations used to calculate the spectral norm. Default: 1.
        eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12.
        name(str, optional): The default value is None. Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          import numpy as np

          with fluid.dygraph.guard():
              x = np.random.random((2, 8, 32, 32)).astype('float32')
              spectralNorm = fluid.dygraph.nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
              ret = spectralNorm(fluid.dygraph.base.to_variable(x))
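`dim` and `power_iters` map directly onto the standard power-iteration estimate of the largest singular value; a NumPy sketch under that reading (`spectral_norm_ref` is illustrative and omits the persistent `u`/`v` state the layer maintains):

.. code-block:: python

    import numpy as np

    def spectral_norm_ref(weight, dim=0, power_iters=1, eps=1e-12):
        # Permute `dim` to the front and flatten the rest, as described above.
        # Assumes power_iters >= 1.
        w = np.moveaxis(weight, dim, 0)
        mat = w.reshape(w.shape[0], -1)
        u = np.random.normal(size=mat.shape[0]).astype(weight.dtype)
        for _ in range(power_iters):
            v = mat.T @ u
            v /= (np.linalg.norm(v) + eps)
            u = mat @ v
            u /= (np.linalg.norm(u) + eps)
        sigma = u @ mat @ v  # estimated spectral norm (largest singular value)
        return weight / sigma

    w = np.random.random((2, 8, 32, 32)).astype('float32')
    w_sn = spectral_norm_ref(w, dim=1, power_iters=2)  # conv weight: dim=1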
......
@@ -4443,62 +4443,69 @@ def layer_norm(input,
                act=None,
                name=None):
    """
    **Layer Normalization Layer**

    The API implements the function of the Layer Normalization Layer and can be applied to mini-batch input data.
    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_

    The formula is as follows:

    .. math::

        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i

        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}

        y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)

    - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
    - :math:`H`: the number of hidden units in a layer
    - :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
    - :math:`g`: the trainable scale parameter.
    - :math:`b`: the trainable bias parameter.

    Args:
        input(Variable): A multi-dimension ``Tensor`` , and the data type is float32 or float64.
        scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
            normalization. Default: True.
        shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
            normalization. Default: True.
        begin_norm_axis(int, optional): The normalization will be performed along
            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
            Default: 1.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-05.
        param_attr(ParamAttr, optional): The parameter attribute for the learnable
            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
            a default :code:`ParamAttr` would be added as scale. The
            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
        bias_attr(ParamAttr, optional): The parameter attribute for the learnable
            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
            omitted. If :attr:`shift` is True and :attr:`bias_attr` is None,
            a default :code:`ParamAttr` would be added as bias. The
            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
        act(str, optional): Activation to be applied to the output of layer normalization.
            Default: None.
        name(str, optional): The default value is None. Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        Variable: ``Tensor`` holding the normalized result, with the same data type and shape as ``input`` .

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import numpy as np

            x = fluid.data(name='x', shape=[-1, 32, 32], dtype='float32')
            hidden1 = fluid.layers.layer_norm(input=x, begin_norm_axis=1)
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            np_x = np.random.random(size=(8, 32, 32)).astype('float32')
            output = exe.run(feed={"x": np_x}, fetch_list=[hidden1])
            print(output)
    """
assert in_dygraph_mode( assert in_dygraph_mode(
    ) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!"
......