From fd6631ef2fb8c896f89ddaa94a5112c08665f35b Mon Sep 17 00:00:00 2001
From: lujun <lujun315023@126.com>
Date: Thu, 27 Jun 2019 19:07:41 +0800
Subject: [PATCH] Fix dygraph show style (#18297)

Fix dygraph show style for FluidDoc.
---
 paddle/fluid/API.spec                     |   4 +-
 paddle/fluid/pybind/imperative.cc         |  31 +--
 python/paddle/fluid/dygraph/checkpoint.py |  55 ++--
 python/paddle/fluid/dygraph/nn.py         | 306 ++++++++++++----------
 4 files changed, 210 insertions(+), 186 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index e081070eb8..b73f582eb9 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -728,8 +728,8 @@ paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, key
 paddle.fluid.dygraph.start_gperf_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.stop_gperf_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', 'd264d1d00dcf3c7e957978563369b57f'))
-paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', '8a0f69b10754e45907b904aa68f3c5dc'))
+paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', '7f526f879139a14cda8e0b5a9171f264'))
+paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', '2574d50a7a9f89fb0d74ddf73d8128f0'))
 paddle.fluid.dygraph.NoamDecay.__init__ (ArgSpec(args=['self', 'd_model', 'warmup_steps', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.NoamDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.NoamDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 0d15b9a44d..aaae26cd0c 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -163,21 +163,22 @@ void BindImperative(pybind11::module *m_ptr) {
 
     Examples:
 
-     .. code-block:: python
-        import numpy as np
-        import paddle.fluid as fluid
-        from paddle.fluid import FC
-
-        x = np.ones([2, 2], np.float32)
-        with fluid.dygraph.guard():
-            inputs2 = []
-            for _ in range(10):
-                inputs2.append(fluid.dygraph.base.to_variable(x))
-            ret2 = fluid.layers.sums(inputs2)
-            loss2 = fluid.layers.reduce_sum(ret2)
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            loss2.backward(backward_strategy)
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid as fluid
+          from paddle.fluid import FC
+
+          x = np.ones([2, 2], np.float32)
+          with fluid.dygraph.guard():
+              inputs2 = []
+              for _ in range(10):
+                  inputs2.append(fluid.dygraph.base.to_variable(x))
+              ret2 = fluid.layers.sums(inputs2)
+              loss2 = fluid.layers.reduce_sum(ret2)
+              backward_strategy = fluid.dygraph.BackwardStrategy()
+              backward_strategy.sort_sum_gradient = True
+              loss2.backward(backward_strategy)
       )DOC");
   backward_strategy.def(py::init())
       .def_property("sort_sum_gradient",
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 5284940555..6bcd94b45c 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -43,35 +43,38 @@ def save_persistables(model_dict, dirname='save_dir', optimizers=None):
         optimizers(fluid.Optimizer|list(fluid.Optimizer)|None): The optimizers to be saved
 
     Returns:
+        None
 
     Examples:
+
         .. code-block:: python
-            ptb_model = PtbModel(
+
+          ptb_model = PtbModel(
                 hidden_size=hidden_size,
                 vocab_size=vocab_size,
                 num_layers=num_layers,
                 num_steps=num_steps,
                 init_scale=init_scale)
-            sgd = fluid.optimizer.SGD(learning_rate=0.01)
-            x_data = np.arange(12).reshape(4, 3).astype('int64')
-            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
-            x_data = x_data.reshape((-1, num_steps, 1))
-            y_data = y_data.reshape((-1, 1))
-            init_hidden_data = np.zeros(
+          sgd = fluid.optimizer.SGD(learning_rate=0.01)
+          x_data = np.arange(12).reshape(4, 3).astype('int64')
+          y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+          x_data = x_data.reshape((-1, num_steps, 1))
+          y_data = y_data.reshape((-1, 1))
+          init_hidden_data = np.zeros(
                 (num_layers, batch_size, hidden_size), dtype='float32')
-            init_cell_data = np.zeros(
+          init_cell_data = np.zeros(
                 (num_layers, batch_size, hidden_size), dtype='float32')
-            x = to_variable(x_data)
-            y = to_variable(y_data)
-            init_hidden = to_variable(init_hidden_data)
-            init_cell = to_variable(init_cell_data)
-            dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+          x = to_variable(x_data)
+          y = to_variable(y_data)
+          init_hidden = to_variable(init_hidden_data)
+          init_cell = to_variable(init_cell_data)
+          dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                         init_cell)
-            dy_loss.backward()
-            sgd.minimize(dy_loss)
-            ptb_model.clear_gradient()
-            param_path = "./my_paddle_model"
-            fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, sgd)
+          dy_loss.backward()
+          sgd.minimize(dy_loss)
+          ptb_model.clear_gradient()
+          param_path = "./my_paddle_model"
+          fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, sgd)
     """
     if isinstance(model_dict, collections.OrderedDict):
         _save_var_to_file(model_dict, optimizers, dirname, None)
@@ -95,13 +98,15 @@ def load_persistables(dirname='save_dir'):
         optimizer dict: The optimizer
 
     Examples:
-        .. code-block:: python
-            my_layer = layer(fluid.Layer)
-            param_path = "./my_paddle_model"
-            sgd = SGDOptimizer(learning_rate=1e-3)
-            param_dict, optimizer_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
-            param_1 = param_dict['PtbModel_0.w_1']
-            sgd.load(optimizer_dict)
+
+         .. code-block:: python
+
+           my_layer = layer(fluid.Layer)
+           param_path = "./my_paddle_model"
+           sgd = SGDOptimizer(learning_rate=1e-3)
+           param_dict, optimizer_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
+           param_1 = param_dict['PtbModel_0.w_1']
+           sgd.load(optimizer_dict)
 
         """
     return _load_var_from_file(dirname)
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 3fa74d78f5..f933e22ddf 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -42,7 +42,7 @@ class Conv2D(layers.Layer):
     and W is the width of the filter. If the groups is greater than 1,
     C will equal the number of input image channels divided by the groups.
     Please refer to UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`__
     for more detials.
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
@@ -124,11 +124,7 @@ class Conv2D(layers.Layer):
 
     Examples:
         .. code-block:: python
-          
-          with fluid.dygraph.guard():
-             conv2d = Conv2D( "conv2d", 2, 3)
-             data = to_variable( data )
-             conv = conv2d( data )
+
           from paddle.fluid.dygraph.base import to_variable
           import paddle.fluid as fluid
           from paddle.fluid.dygraph import Conv2D
@@ -306,9 +302,8 @@ class Conv3D(layers.Layer):
             W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
 
     Args:
-        input (Variable): The input image with [N, C, D, H, W] format.
-            num_filters(int): The number of filter. It is as same as the output
-            image channel.
+        name_scope(str) : The name for this class.
+        num_filters(int): The number of filters, which is the same as the number of output image channels.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
             it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
             Otherwise, the filter will be a square.
@@ -340,8 +335,6 @@ class Conv3D(layers.Layer):
             library is installed. Default: True
         act (str): Activation type, if it is set to None, activation is not appended.
             Default: None.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -359,7 +352,6 @@ class Conv3D(layers.Layer):
 
           with fluid.dygraph.guard():
               data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32')
-
               conv3d = fluid.dygraph.nn.Conv3D(
                     'Conv3D', num_filters=2, filter_size=3, act="relu")
               ret = conv3d(fluid.dygraph.base.to_variable(data))
@@ -511,7 +503,7 @@ class Conv3DTranspose(layers.Layer):
            W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
 
     Args:
-        input(Variable): The input image with [N, C, D, H, W] format.
+        name_scope(str) : The name for this class.
         num_filters(int): The number of the filter. It is as same as the output
             image channel.
         output_size(int|tuple|None): The output image size. If output size is a
@@ -681,29 +673,31 @@ class Conv3DTranspose(layers.Layer):
 
 
 class Pool2D(layers.Layer):
-    # TODO, should delete this class
     """
-    ${comment}
+    The pooling2d operation calculates the output based on the input, pooling_type and ksize, strides,
+    paddings parameters. Input(X) and output(Out) are in NCHW format, where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+    The parameters (ksize, strides, paddings) each contain two elements, which represent height and width, respectively.
+    The input(X) size and output(Out) size may be different.
 
     Args:
         name_scope(str) : The name of this class.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
-            Otherwise, the pool kernel size will be a square of an int.
-        pool_type: ${pooling_type_comment}
+            Otherwise, the pool kernel size will be a square of an int. Default: -1
+        pool_type(str) : The pooling type, can be "max" for max-pooling and "avg" for average-pooling. Default: max
         pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
-            it must contain two integers, (pool_stride_Height, pool_stride_Width).
-            Otherwise, the pool stride size will be a square of an int.
+            it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise,
+            the pool stride size will be a square of an int. Default: 1
         pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
             it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
-            Otherwise, the pool padding size will be a square of an int.
-        global_pooling (bool): ${global_pooling_comment}
-        use_cudnn (bool): ${use_cudnn_comment}
-        ceil_mode (bool): ${ceil_mode_comment}
-        name (str|None): A name for this layer(optional). If set None, the
-                        layer will be named automatically.
-        exclusive (bool): Whether to exclude padding points in average pooling
-                          mode, default is true
+            Otherwise, the pool padding size will be a square of an int. Default: 0
+        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
+            kernel size and paddings will be ignored. Default: False
+        use_cudnn (bool): Only used in cudnn kernel, need install cudnn. Default: True
+        ceil_mode (bool): Whether to use the ceil function to calculate output height and width.
+            If it is set to False, the floor function will be used. Default: False
+        exclusive (bool): Whether to exclude padding points in average pooling mode. Default: True
 
     Returns:
         Variable: The pooling result.
@@ -717,14 +711,18 @@ class Pool2D(layers.Layer):
 
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          pool2d = fluid.Pool2D("pool2d",pool_size=2,
+          import paddle.fluid as fluid
+          import numpy
+
+          with fluid.dygraph.guard():
+             data = numpy.random.random((5, 3, 32, 32)).astype('float32')
+
+             pool2d = fluid.dygraph.Pool2D("pool2d",pool_size=2,
                             pool_type='max',
                             pool_stride=1,
                             global_pooling=False)
+             pool2d_res = pool2d(fluid.dygraph.base.to_variable(data))
 
-          pool2d_res = pool2d(data)
     """
 
     def __init__(self,
@@ -840,9 +838,9 @@ class FC(layers.Layer):
             out.shape = (1, 2)
 
     Args:
-        name(str): The name of this class.
+        name_scope(str): The name of this class.
         size(int): The number of output units in this layer.
-        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+        num_flatten_dims (int): The fc layer can accept an input tensor with more than
             two dimensions. If this happens, the multidimensional tensor will first be flattened
             into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
             tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
@@ -850,14 +848,14 @@ class FC(layers.Layer):
             the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
             form the second dimension of the final matrix (width of the matrix). For example, suppose
             `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
-        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
+        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
             parameters/weights of this layer.
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
             of this layer. If it is set to False, no bias will be added to the output units.
             If it is set to None, the bias is initialized zero. Default: None.
-        act (str, default None): Activation to be applied to the output of this layer.
-        is_test(bool): A flag indicating whether execution is in test phase.
+        act (str|None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase. Default: False
         dtype(str): Dtype used for weight
 
     Raises:
@@ -865,11 +863,12 @@ class FC(layers.Layer):
 
     Examples:
         .. code-block:: python
-        
+
           from paddle.fluid.dygraph.base import to_variable
           import paddle.fluid as fluid
           from paddle.fluid.dygraph import FC
           import numpy as np
+
           data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
           with fluid.dygraph.guard():
               fc = FC( "fc", 64, num_flatten_dims=2)
@@ -1015,16 +1014,16 @@ class BatchNorm(layers.Layer):
         y_i &\\gets \\gamma \\hat{x_i} + \\beta
 
     Args:
-        input(variable): The rank of input variable can be 2, 3, 4, 5.
-        act(string, Default None): Activation type, linear|relu|prelu|...
-        is_test (bool, Default False): A flag indicating whether it is in
-            test phrase or not.
-        momentum(float, Default 0.9): The value used for the moving_mean and
+        name_scope(str): The name of this class.
+        act(str|None): Activation type, linear|relu|prelu|...
+        is_test (bool): A flag indicating whether it is in
+            test phase or not. Default: False
+        momentum(float): The value used for the moving_mean and
             moving_var computation. The updated formula is:
             :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
             :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
             Default is 0.9.
-        epsilon(float, Default 1e-05): A value added to the denominator for
+        epsilon(float): A value added to the denominator for
             numerical stability. Default is 1e-5.
         param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
              of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
@@ -1034,32 +1033,33 @@ class BatchNorm(layers.Layer):
              If it is set to None or one attribute of ParamAttr, batch_norm
              will create ParamAttr as bias_attr. If the Initializer of the bias_attr
              is not set, the bias is initialized zero. Default: None.
-        data_layout(string, default NCHW): NCHW|NHWC
-        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
-        name(string, Default None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
-        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
+        data_layout(string): NCHW|NHWC. Default: NCHW
+        in_place(bool): Make the input and output of batch norm reuse memory. Default: False
+        moving_mean_name(string|None): The name of moving_mean which store the global Mean. Default: None
         moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
         do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
-        fuse_with_relu (bool): if True, this OP performs relu after batch norm.
-        use_global_stats(bool, Default False): Whether to use global mean and
+        fuse_with_relu (bool): if True, this OP performs relu after batch norm. Default: False
+        use_global_stats(bool): Whether to use global mean and
             variance. In inference or test mode, set use_global_stats to true
             or is_test to true, and the behavior is equivalent.
             In train mode, when setting use_global_stats True, the global mean
-            and variance are also used during train period.
-        trainable_statistics(bool, Default False): Whether to calculate mean and var in eval mode. In eval mode, when
-            setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
+            and variance are also used during train period. Default: False
+        trainable_statistics(bool): Whether to calculate mean and var in eval mode. In eval mode, when
+            setting trainable_statistics True, mean and variance will be calculated by current batch statistics. Default: False
 
     Returns:
         Variable: A tensor variable which is the result after applying batch normalization on the input.
 
     Examples:
-
         .. code-block:: python
-            fc = fluid.FC('fc', size=200, param_attr='fc1.w')
-            hidden1 = fc(x)
-            batch_norm = fluid.BatchNorm("batch_norm", 10)
-            hidden2 = batch_norm(hidden1)
+
+          import paddle.fluid as fluid
+
+          with fluid.dygraph.guard():
+              fc = fluid.FC('fc', size=200, param_attr='fc1.w')
+              hidden1 = fc(x)
+              batch_norm = fluid.BatchNorm("batch_norm", 10)
+              hidden2 = batch_norm(hidden1)
     """
 
     def __init__(self,
@@ -1196,14 +1196,16 @@ class Embedding(layers.Layer):
     All the input variables are passed in as local variables to the LayerHelper constructor
 
     Args:
-        name_scope: See base class.
-        size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively.
-
-        is_sparse(bool): The flag indicating whether to use sparse update.
-        is_distributed(bool): Whether to run lookup table from remote parameter server.
-        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters it in :attr:`input`. If :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`.
-        param_attr(ParamAttr): Parameters for this layer
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
+        name_scope(str): The name of this class.
+        size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size
+            of the dictionary of embeddings and the size of each embedding vector respectively.
+        is_sparse(bool): The flag indicating whether to use sparse update. Default: False
+        is_distributed(bool): Whether to run lookup table from remote parameter server. Default: False.
+        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
+            Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters
+            it in :attr:`input`. If :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`. Default: None.
+        param_attr(ParamAttr): Parameters for this layer. Default: None.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float16, int etc. Default: 'float32'.
 
     Returns:
         Variable: The tensor variable storing the embeddings of the \
@@ -1213,15 +1215,19 @@ class Embedding(layers.Layer):
 
         .. code-block:: python
 
+          import paddle.fluid as fluid
+          import paddle.fluid.dygraph.base as base
+          import numpy as np
+
           inp_word = np.array([[[1]]]).astype('int64')
           dict_size = 20
           with fluid.dygraph.guard():
-              emb = fluid.Embedding(
+              emb = fluid.dygraph.Embedding(
                   name_scope='embedding',
                   size=[dict_size, 32],
                   param_attr='emb.w',
                   is_sparse=False)
-            static_rlt3 = emb2(base.to_variable(inp_word))
+              static_rlt3 = emb(base.to_variable(inp_word))
     """
 
     def __init__(self,
@@ -1232,7 +1238,6 @@ class Embedding(layers.Layer):
                  padding_idx=None,
                  param_attr=None,
                  dtype='float32'):
-
         super(Embedding, self).__init__(name_scope, dtype)
         self._size = size
         self._is_sparse = is_sparse
@@ -1299,28 +1304,28 @@ class LayerNorm(layers.Layer):
     * :math:`b`: the trainable bias parameter.
 
     Args:
-        name_scope (str): See base class.
+        name_scope(str): The name of this class.
         scale(bool): Whether to learn the adaptive gain :math:`g` after
-            normalization. Default True.
+            normalization. Default: True.
         shift(bool): Whether to learn the adaptive bias :math:`b` after
-            normalization. Default True.
+            normalization. Default: True.
         begin_norm_axis(int): The normalization will be performed along
             dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-            Default 1.
+            Default: 1.
         epsilon(float): The small value added to the variance to prevent
-            division by zero. Default 1e-05.
+            division by zero. Default: 1e-05.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
             omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
             a default :code:`ParamAttr` would be added as scale. The
-            :attr:`param_attr` is initialized as 1 if it is added. Default None.
+            :attr:`param_attr` is initialized as 1 if it is added. Default: None.
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
             omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
             a default :code:`ParamAttr` would be added as bias. The
-            :attr:`bias_attr` is initialized as 0 if it is added. Default None.
+            :attr:`bias_attr` is initialized as 0 if it is added. Default: None.
         act(str): Activation to be applied to the output of layer normalizaiton.
-                  Default None.
+                  Default: None.
     Returns:
         Result after normalization
 
@@ -1414,7 +1419,7 @@ class GRUUnit(layers.Layer):
 
     if origin_mode is True, then the equation of a gru step is from paper
     `Learning Phrase Representations using RNN Encoder-Decoder for Statistical
-    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`_
+    Machine Translation <https://arxiv.org/pdf/1406.1078.pdf>`__
 
         .. math::
             u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
@@ -1452,10 +1457,8 @@ class GRUUnit(layers.Layer):
     and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
 
     Args:
-        input (Variable): The fc transformed input value of current step.
-        name_scope (str): See base class.
-        hidden (Variable): The hidden value of gru unit from previous step.
-        size (integer): The input dimension value.
+        name_scope(str): The name of this class.
+        size (int): The input dimension value.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
             hidden-hidden weight matrix. Note:
 
@@ -1477,14 +1480,34 @@ class GRUUnit(layers.Layer):
             attribute of ParamAttr, gru_unit will create ParamAttr as
             bias_attr. If the Initializer of the bias_attr is not set, the bias
             is initialized zero. Default: None.
-        activation (string): The activation type for cell (actNode).
+        activation (str): The activation type for cell (actNode).
                              Default: 'tanh'
-        gate_activation (string): The activation type for gates (actGate).
+        gate_activation (str): The activation type for gates (actGate).
                                   Default: 'sigmoid'
-        dtype(string): The dtype of the layers
+        dtype(str): The dtype of the layers. Default: 'float32'
 
     Returns:
         tuple: The hidden value, reset-hidden value and gate values.
+
+    Examples:
+
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          import paddle.fluid.dygraph.base as base
+          import numpy
+
+          lod = [[2, 4, 3]]
+          D = 5
+          T = sum(lod[0])
+
+          hidden_input = numpy.random.rand(T, D).astype('float32')
+          with fluid.dygraph.guard():
+              x = numpy.random.rand(T, D * 3).astype('float32')
+              gru = fluid.dygraph.GRUUnit('gru', size=D * 3)
+              dy_ret = gru(
+                base.to_variable(x), base.to_variable(hidden_input))
+
     """
 
     def __init__(self,
@@ -1548,15 +1571,12 @@ class NCE(layers.Layer):
     Compute and return the noise-contrastive estimation training loss. See
     `Noise-contrastive estimation: A new estimation principle for unnormalized
     statistical models
-     <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
+     <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`__.
     By default this operator uses a uniform distribution for sampling.
 
     Args:
-        name_scope (str): See base class.
+        name_scope(str): The name of this class.
         num_total_classes (int): Total number of classes in all samples
-        sample_weight (Variable|None): A Variable of shape [batch_size, 1]
-            storing a weight for each sample. The default weight for each
-            sample is 1.0.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
              of nce. If it is set to None or one attribute of ParamAttr, nce
              will create ParamAttr as param_attr. If the Initializer of the param_attr
@@ -1567,17 +1587,15 @@ class NCE(layers.Layer):
              will create ParamAttr as bias_attr. If the Initializer of the bias_attr
              is not set, the bias is initialized zero. Default: None.
         num_neg_samples (int): The number of negative classes. The default value is 10.
-        name (str|None): A name for this layer(optional). If set None, the layer
-             will be named automatically. Default: None.
         sampler (str): The sampler used to sample class from negtive classes.
                        It can be 'uniform', 'log_uniform' or 'custom_dist'.
                        default: 'uniform'.
-        custom_dist (float[]): A float[] with size=num_total_classes.
+        custom_dist (float[]|None): A float[] with size=num_total_classes.
                        It is used when sampler is set to 'custom_dist'.
                        custom_dist[i] is the probsbility of i-th class to be sampled.
-                       default: None.
-        seed (int): The seed used in sampler. default: 0.
-        is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
+                       Default: None.
+        seed (int): The seed used in sampler. Default: 0.
+        is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default: False.
 
     Returns:
         Variable: The output nce loss.
@@ -1786,15 +1804,13 @@ class PRelu(layers.Layer):
         y = \max(0, x) + \\alpha * \min(0, x)
 
     Args:
-        name_scope (str): See base class.
-        mode (string): The mode for weight sharing. It supports all, channel
+        name_scope(str): The name of this class.
+        mode (str): The mode for weight sharing. It supports all, channel
           and element. all: all elements share same weight
           channel:elements in a channel share same weight
           element:each element has a weight
         param_attr(ParamAttr|None): The parameter attribute for the learnable
           weight (alpha).
-        name(str|None): A name for this layer(optional). If set None, the layer
-          will be named automatically.
 
     Returns:
         Variable: The output tensor with the same shape as input.
@@ -1803,14 +1819,18 @@ class PRelu(layers.Layer):
 
         .. code-block:: python
 
-        inp_np = np.ones([5, 200, 100, 100]).astype('float32')
-        with fluid.dygraph.guard():
-            mode = 'channel'
-            prelu = fluid.PRelu(
-                'prelu',
-                mode=mode,
-                param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
-            dy_rlt = prelu(fluid.dygraph.base.to_variable(inp_np))
+          import paddle.fluid as fluid
+          import numpy as np
+
+          inp_np = np.ones([5, 200, 100, 100]).astype('float32')
+          with fluid.dygraph.guard():
+              mode = 'channel'
+              prelu = fluid.PRelu(
+                 'prelu',
+                 mode=mode,
+                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0)))
+              dy_rlt = prelu(fluid.dygraph.base.to_variable(inp_np))
+
     """
 
     def __init__(self, name_scope, mode, param_attr=None):
@@ -1865,13 +1885,13 @@ class BilinearTensorProduct(layers.Layer):
      - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
 
     Args:
-       name_scope (str): See base class.
+       name_scope(str): The name of this class.
        size (int): The dimension of this layer.
-       act (str, default None): Activation to be applied to the output of this layer.
-       name (str, default None): The name of this layer.
-       param_attr (ParamAttr, default None): The parameter attribute for the learnable w.
-           parameters/weights of this layer.
-       bias_attr (ParamAttr, default None): The parameter attribute for the bias
+       act (str): Activation to be applied to the output of this layer. Default: None.
+       name (str): The name of this layer. Default: None.
+       param_attr (ParamAttr): The parameter attribute for the learnable
+           parameters/weights (w) of this layer. Default: None.
+       bias_attr (ParamAttr): The parameter attribute for the bias
            of this layer. If it is set to False, no bias will be added to the output units.
            If it is set to None, the bias is initialized zero. Default: None.
 
@@ -2000,18 +2020,18 @@ class Conv2DTranspose(layers.Layer):
            W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
 
     Args:
-        name_scope (str): See base class.
+        name_scope(str): The name of this class.
         num_filters(int): The number of the filter. It is as same as the output
             image channel.
         output_size(int|tuple|None): The output image size. If output size is a
             tuple, it must contain two integers, (image_H, image_W). None if use
             filter_size, padding, and stride to calculate output_size.
             if output_size and filter_size are specified at the same time, They
-            should follow the formula above.
+            should follow the formula above. Default: None.
         filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
             it must contain two integers, (filter_size_H, filter_size_W).
             Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size.
+            calculate filter_size. Default: None.
         padding(int|tuple): The padding size. If padding is a tuple, it must
             contain two integers, (padding_H, padding_W). Otherwise, the
             padding_H = padding_W = padding. Default: padding = 0.
@@ -2040,8 +2060,6 @@ class Conv2DTranspose(layers.Layer):
             library is installed. Default: True.
         act (str): Activation type, if it is set to None, activation is not appended.
             Default: None.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: True.
 
     Returns:
         Variable: The tensor variable storing the convolution transpose result.
@@ -2173,11 +2191,11 @@ class SequenceConv(layers.Layer):
     in the input parameters to the function.
 
     Args:
-        name_scope (str): See base class.
+        name_scope(str): The name of this class.
         num_filters (int): number of filters.
-        filter_size (int): the filter size (H and W).
-        filter_stride (int): stride of the filter.
-        padding (bool): if True, add paddings.
+        filter_size (int): the filter size (H and W). Default: 3.
+        filter_stride (int): stride of the filter. Default: 1.
+        padding (bool|None): if True, add paddings. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv.
             If it is set to False, no bias will be added to the output units.
             If it is set to None or one attribute of ParamAttr, sequence_conv
@@ -2189,8 +2207,6 @@ class SequenceConv(layers.Layer):
             is not set, the parameter is initialized with Xavier. Default: None.
         act (str): Activation type, if it is set to None, activation is not appended.
             Default: None.
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None.
 
     Returns:
         Variable: output of sequence_conv
@@ -2259,15 +2275,16 @@ class RowConv(layers.Layer):
     More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
 
     Args:
-        name_scope (str): See base class.
+        name_scope(str): The name of this class.
         future_context_size (int): Future context size. Please note, the shape
             of convolution kernel is [future_context_size + 1, D].
         param_attr (ParamAttr): Attributes of parameters, including
-            name, initializer etc.
-        act (str): Non-linear activation to be applied to output variable.
+            name, initializer etc. Default: None.
+        act (str): Non-linear activation to be applied to output variable. Default: None.
 
     Returns:
-        the output(Out) is a LodTensor, which supports variable time-length input sequences. The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X.
+        The output(Out) is a LoDTensor, which supports variable time-length input sequences.
+        The underlying tensor in this LoDTensor is a matrix with shape T x N, i.e., the same shape as X.
 
     Examples:
         .. code-block:: python
@@ -2321,10 +2338,10 @@ class GroupNorm(layers.Layer):
         Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_ .
 
         Args:
-            name_scope (str): See base class.
+            name_scope(str): The name of this class.
             groups(int): The number of groups that divided from channels.
             epsilon(float): The small value added to the variance to prevent
-                division by zero.
+                division by zero. Default: 1e-05.
             param_attr(ParamAttr|None): The parameter attribute for the learnable
                 scale :math:`g`. If it is set to False, no scale will be added to the output units.
                 If it is set to None, the bias is initialized one. Default: None.
@@ -2333,7 +2350,6 @@ class GroupNorm(layers.Layer):
                 If it is set to None, the bias is initialized zero. Default: None.
             act(str): Activation to be applied to the output of group normalizaiton.
             data_layout(string|NCHW): Only NCHW is supported.
-            dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
 
         Returns:
             Variable: A tensor variable which is the result after applying group normalization on the input.
@@ -2450,10 +2466,10 @@ class SpectralNorm(layers.Layer):
     Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
 
     Args:
-        name_scope (str): See base class.
-        dim(int): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer, default 0
-        power_iters(int): number of power iterations to calculate spectral norm, default 1
-        eps(float): epsilon for numerical stability in calculating norms
+        name_scope(str): The name of this class.
+        dim(int): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: 0.
+        power_iters(int): The number of power iterations to calculate spectral norm. Default: 1.
+        eps(float): The epsilon for numerical stability in calculating norms. Default: 1e-12.
         name (str): The name of this layer. It is optional.
 
     Returns:
@@ -2527,20 +2543,22 @@ class TreeConv(layers.Layer):
 
 
         Args:
-            name_scope (str): See base class.
+            name_scope(str): The name of this class.
             output_size(int): output feature width
-            num_filters(int): number of filters, Default 1
-            max_depth(int): max depth of filters, Default 2
-            act(str): activation function, Default tanh
-            param_attr(ParamAttr): the parameter attribute for the filters, Default None
-            bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
-            name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None
+            num_filters(int): number of filters. Default: 1.
+            max_depth(int): max depth of filters. Default: 2.
+            act(str): activation function. Default: tanh.
+            param_attr(ParamAttr): the parameter attribute for the filters. Default: None.
+            bias_attr(ParamAttr): the parameter attribute for the bias of this layer. Default: None.
+            name(str): a name of this layer(optional). If set None, the layer will be named automatically. Default: None.
 
         Returns:
             out(Variable): (Tensor) The feature vector of subtrees. The shape of the output tensor is [max_tree_node_size, output_size, num_filters]. The output tensor could be a new feature vector for next tree convolution layers
 
         Examples:
+
             .. code-block:: python
+
               import paddle.fluid as fluid
               import numpy
 
-- 
GitLab