diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 5e80f307eeeef8a2a44a505086b88bb3d6a4804b..ff27237327f63213ce0aafb14fb6c4019209662b 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -27,9 +27,6 @@ from ...fluid.layers.layer_function_generator import templatedoc
 
 def diag_embed(input, offset=0, dim1=-2, dim2=-1):
     """
-    :alias_main: paddle.nn.functional.diag_embed
-    :alias: paddle.nn.functional.diag_embed,paddle.nn.functional.extension.diag_embed
-
     This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2)
     are filled by ``input``. By default, a 2D plane formed by the last two dimensions
     of the returned tensor will be selected.
@@ -41,60 +38,59 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1):
         - If offset < 0, it is below the main diagonal.
 
     Args:
-        input(Variable|numpy.ndarray): The input tensor. Must be at least 1-dimensional. The input data type should be float32, float64, int32, int64.
+        input(Tensor|numpy.ndarray): The input tensor. Must be at least 1-dimensional. The input data type should be float32, float64, int32, int64.
         offset(int, optional): Which diagonal to consider. Default: 0 (main diagonal).
         dim1(int, optional): The first dimension with respect to which to take diagonal. Default: -2.
         dim2(int, optional): The second dimension with respect to which to take diagonal. Default: -1.
 
     Returns:
-        Variable, the output data type is the same as input data type.
+        Tensor, the output data type is the same as input data type.
 
     Examples:
         .. code-block:: python
 
            import paddle.nn.functional as F
-           import paddle.fluid.dygraph as dg
           import numpy as np
 
           diag_embed = np.random.randn(2, 3).astype('float32')
           # [[ 0.7545889 , -0.25074545, 0.5929117 ],
           #  [-0.6097662 , -0.01753256, 0.619769 ]]
-           with dg.guard():
-               data1 = F.diag_embed(diag_embed)
-               data1.numpy()
-               # [[[ 0.7545889 , 0. , 0. ],
-               #   [ 0. , -0.25074545, 0. ],
-               #   [ 0. , 0. , 0.5929117 ]],
-
-               #  [[-0.6097662 , 0. , 0. ],
-               #   [ 0. , -0.01753256, 0. ],
-               #   [ 0. , 0. , 0.619769 ]]]
-
-               data2 = F.diag_embed(diag_embed, offset=-1, dim1=0, dim2=2)
-               data2.numpy()
-               # [[[ 0. , 0. , 0. , 0. ],
-               #   [ 0.7545889 , 0. , 0. , 0. ],
-               #   [ 0. , -0.25074545, 0. , 0. ],
-               #   [ 0. , 0. , 0.5929117 , 0. ]],
-               #
-               #  [[ 0. , 0. , 0. , 0. ],
-               #   [-0.6097662 , 0. , 0. , 0. ],
-               #   [ 0. , -0.01753256, 0. , 0. ],
-               #   [ 0. , 0. , 0.619769 , 0. ]]]
-
-               data3 = F.diag_embed(diag_embed, offset=1, dim1=0, dim2=2)
-               data3.numpy()
-               # [[[ 0. , 0.7545889 , 0. , 0. ],
-               #   [ 0. , -0.6097662 , 0. , 0. ]],
-               #
-               #  [[ 0. , 0. , -0.25074545, 0. ],
-               #   [ 0. , 0. , -0.01753256, 0. ]],
-               #
-               #  [[ 0. , 0. , 0. , 0.5929117 ],
-               #   [ 0. , 0. , 0. , 0.619769 ]],
-               #
-               #  [[ 0. , 0. , 0. , 0. ],
-               #   [ 0. , 0. , 0. , 0. ]]]
+
+           data1 = F.diag_embed(diag_embed)
+           data1.numpy()
+           # [[[ 0.7545889 , 0. , 0. ],
+           #   [ 0. , -0.25074545, 0. ],
+           #   [ 0. , 0. , 0.5929117 ]],
+
+           #  [[-0.6097662 , 0. , 0. ],
+           #   [ 0. , -0.01753256, 0. ],
+           #   [ 0. , 0. , 0.619769 ]]]
+
+           data2 = F.diag_embed(diag_embed, offset=-1, dim1=0, dim2=2)
+           data2.numpy()
+           # [[[ 0. , 0. , 0. , 0. ],
+           #   [ 0.7545889 , 0. , 0. , 0. ],
+           #   [ 0. , -0.25074545, 0. , 0. ],
+           #   [ 0. , 0. , 0.5929117 , 0. ]],
+           #
+           #  [[ 0. , 0. , 0. , 0. ],
+           #   [-0.6097662 , 0. , 0. , 0. ],
+           #   [ 0. , -0.01753256, 0. , 0. ],
+           #   [ 0. , 0. , 0.619769 , 0. ]]]
+
+           data3 = F.diag_embed(diag_embed, offset=1, dim1=0, dim2=2)
+           data3.numpy()
+           # [[[ 0. , 0.7545889 , 0. , 0. ],
+           #   [ 0. , -0.6097662 , 0. , 0. ]],
+           #
+           #  [[ 0. , 0. , -0.25074545, 0. ],
+           #   [ 0. , 0. , -0.01753256, 0. ]],
+           #
+           #  [[ 0. , 0. , 0. , 0.5929117 ],
+           #   [ 0. , 0. , 0. , 0.619769 ]],
+           #
+           #  [[ 0. , 0. , 0. , 0. ],
+           #   [ 0. , 0. , 0. , 0. ]]]
     """
     inputs = {'Input': [input]}
     attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2}
@@ -151,15 +147,15 @@
     ${comment}
 
     Args:
-        input (Variable): the input(X) is a LodTensor or tensor, LodTensor(X)
-            supports variable time-length input sequences. The underlying
+        input (Tensor): the input(X) is a LoDTensor or tensor, LoDTensor(X)
+            supports variable time-length input sequences. The underlying
            tensor in this LoDTensor is a matrix with shape (T, D), where
            T is the total time steps in this mini-batch and D is the input
            data dimension.
            If the input is a padded minibatch, the shape of the input is
            (N, T, D), N is batch size, T is the max time steps in the batch,
            D is the input data dimension.
-        weight (Variable): The weight. A Tensor with shape
+        weight (Tensor): The weight. A Tensor with shape
            (future_context_size + 1, D), where future_context_size is the
            context size of the RowConv operator.
         act (str): Non-linear activation to be applied to output variable.
@@ -171,7 +167,7 @@
         .. code-block:: python
 
+            import paddle
             from paddle import fluid, nn
-            import paddle.fluid.dygraph as dg
             import paddle.nn.functional as F
             import numpy as np
@@ -182,16 +177,12 @@
             x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)
             weight = np.random.randn(context_size + 1, feature_size).astype(np.float32)
 
-            place = fluid.CPUPlace()
-            with dg.guard(place):
-                x_var = dg.to_variable(x)
-                w_var = dg.to_variable(weight)
-                y_var = F.extension.row_conv(x_var, w_var)
-                y_np = y_var.numpy()
-
-                print(y_np.shape)
+            x_var = paddle.to_tensor(x)
+            w_var = paddle.to_tensor(weight)
+            y_var = F.extension.row_conv(x_var, w_var)
+            print(y_var.shape)
 
-            # (4, 8, 6)
+            # [4, 8, 6]
     """
 
     if in_dygraph_mode():
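As a quick sanity check on the rewritten `diag_embed` example, here is a self-contained sketch that runs outside the docstring. It assumes a 2.0-era Paddle build where dygraph mode is the default; the sizes and values are illustrative only:

.. code-block:: python

    import numpy as np
    import paddle.nn.functional as F

    # Each row of `diag` becomes the main diagonal of one 3x3 plane.
    diag = np.random.randn(2, 3).astype('float32')
    out = F.diag_embed(diag)

    print(out.shape)  # [2, 3, 3]
    # The embedded diagonals round-trip back to the input values.
    print(np.allclose(np.diagonal(out.numpy(), axis1=-2, axis2=-1), diag))  # True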
diff --git a/python/paddle/nn/layer/extension.py b/python/paddle/nn/layer/extension.py
index 3972a1b7157129eca1149f879c2fc1431ec989f6..3505a759c91cb1f3f4f4be838ae3af5a36dd2d53 100644
--- a/python/paddle/nn/layer/extension.py
+++ b/python/paddle/nn/layer/extension.py
@@ -20,9 +20,6 @@ from .. import functional as F
 
 class RowConv(layers.Layer):
     """
-    :alias_main: paddle.nn.RowConv
-    :alias: paddle.nn.RowConv,paddle.nn.layer.RowConv,paddle.nn.layer.extension.RowConv
-
     **Row-convolution operator**
 
     The row convolution is called lookahead convolution. This operator was
@@ -50,7 +47,7 @@
         of convolution kernel is [future_context_size + 1, D].
         param_attr (ParamAttr): Attributes of parameters, including name, initializer etc. Default: None.
-        act (str): Non-linear activation to be applied to output variable. Default: None.
+        act (str): Non-linear activation to be applied to output tensor. Default: None.
         dtype (str, optional): Data type, it can be "float32". Default: "float32".
 
     Attributes:
@@ -63,8 +60,8 @@
     Examples:
         .. code-block:: python
 
-            from paddle import fluid, nn
-            import paddle.fluid.dygraph as dg
+            import paddle
+            from paddle import nn
             import paddle.nn.functional as F
             import numpy as np
@@ -75,15 +71,12 @@
             x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)
 
-            place = fluid.CPUPlace()
-            with dg.guard(place):
-                x_var = dg.to_variable(x)
-                conv = nn.RowConv(feature_size, context_size)
-                y_var = conv(x_var)
-                y_np = y_var.numpy()
-                print(y_np.shape)
+            x = paddle.to_tensor(x)
+            conv = nn.RowConv(feature_size, context_size)
+            y = conv(x)
+            print(y.shape)
 
-            # (4, 8, 6)
+            # [4, 8, 6]
     """
 
     def __init__(self,
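The updated `RowConv` example depends on `paddle.to_tensor`, which is why `import paddle` is added above. For reference, a standalone version of the same sketch; the sizes are illustrative, and it assumes a build of this branch where `paddle.nn.RowConv` is still exported:

.. code-block:: python

    import numpy as np
    import paddle
    from paddle import nn

    batch_size, time_steps, feature_size, context_size = 4, 8, 6, 4
    x = paddle.to_tensor(
        np.random.randn(batch_size, time_steps, feature_size).astype(np.float32))

    # Lookahead (row) convolution; the kernel has shape
    # [context_size + 1, feature_size].
    conv = nn.RowConv(feature_size, context_size)
    y = conv(x)
    print(y.shape)  # [4, 8, 6], the input shape is preserved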
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 388dddf262ae0546a24d9fe40a2f992657efe102..d06623a2b910f2cbc17cc13bd1108c53da5ecc3a 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -273,12 +273,11 @@ class SimpleRNNCell(RNNCellBase):
     The formula used is as follows:
 
     .. math::
-        h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh})
+        h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
 
         y_{t} & = h_{t}
 
-    where :math:`act` is for :attr:`activation` , and * is the elemetwise
-    multiplication operator.
+    where :math:`act` is for :attr:`activation`.
 
     Please refer to `Finding Structure in Time `_ for more details.
 
@@ -289,46 +288,32 @@
         activation (str, optional): The activation in the SimpleRNN cell. It can be `tanh` or `relu`. Defaults to `tanh`.
         weight_ih_attr (ParamAttr, optional): The parameter attribute for
-            `weight_ih`. Default: None.
+            :math:`weight_ih`. Default: None.
         weight_hh_attr(ParamAttr, optional): The parameter attribute for
-            `weight_hh`. Default: None.
+            :math:`weight_hh`. Default: None.
         bias_ih_attr (ParamAttr, optional): The parameter attribute for the
-            `bias_ih`. Default: None.
+            :math:`bias_ih`. Default: None.
         bias_hh_attr (ParamAttr, optional): The parameter attribute for the
-            `bias_hh`. Default: None.
+            :math:`bias_hh`. Default: None.
         name (str, optional): Name for the operation (optional, default is
             None). For more information, please refer to :ref:`api_guide_Name`.
 
-    Attributes:
-        weight_ih (Parameter): shape (hidden_size, input_size), input to hidden
-            weight, corresponding to :math:`W_{ih}` in the formula.
-        weight_hh (Parameter): shape (hidden_size, hidden_size), hidden to
-            hidden weight, corresponding to :math:`W_{hh}` in the formula.
-        bias_ih (Parameter): shape (hidden_size, ), input to hidden bias,
-            corresponding to :math:`b_{ih}` in the formula.
-        bias_hh (Parameter): shape (hidden_size, ), hidden to hidden bias,
-            corresponding to :math:`b_{hh}` in the formula.
+    Variables:
+        - **weight_ih** (Parameter): shape (hidden_size, input_size), input to hidden weight, corresponding to :math:`W_{ih}` in the formula.
+        - **weight_hh** (Parameter): shape (hidden_size, hidden_size), hidden to hidden weight, corresponding to :math:`W_{hh}` in the formula.
+        - **bias_ih** (Parameter): shape (hidden_size, ), input to hidden bias, corresponding to :math:`b_{ih}` in the formula.
+        - **bias_hh** (Parameter): shape (hidden_size, ), hidden to hidden bias, corresponding to :math:`b_{hh}` in the formula.
 
     Inputs:
-        inputs (Tensor): shape `[batch_size, input_size]`, the input,
-            corresponding to :math:`x_t` in the formula.
-        states (Tensor, optional): shape `[batch_size, hidden_size]`, the
-            previous hidden state, corresponding to :math:`h_{t-1}` in the
-            formula. When states is None, zero state is used. Defaults to
-            None.
+        - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_{t}` in the formula.
+        - **states** (Tensor, optional): shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
 
     Returns:
-        (outputs, new_states)
-        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
-            corresponding to :math:`h_{t}` in the formula.
-        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
-            state, corresponding to :math:`h_{t}` in the formula.
+        - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
+        - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula.
 
     Notes:
-        All the weights and bias are initialized with `Uniform(-std, std)` by
-        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more
-        information about parameter initialization, please refer to
-        :ref:`api_fluid_ParamAttr`.
+        All the weights and biases are initialized with `Uniform(-std, std)` by default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more information about parameter initialization, please refer to :ref:`api_fluid_ParamAttr`.
 
     Examples:
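A minimal dygraph sketch of the call pattern the rewritten `SimpleRNNCell` docs describe; the concrete sizes (16, 32, batch of 4) are illustrative only:

.. code-block:: python

    import paddle

    cell = paddle.nn.SimpleRNNCell(input_size=16, hidden_size=32)

    x = paddle.randn([4, 16])       # [batch_size, input_size], x_t in the formula
    prev_h = paddle.randn([4, 32])  # [batch_size, hidden_size], h_{t-1}

    y, h = cell(x, prev_h)          # the output and the new state both equal h_t
    print(y.shape)                  # [4, 32]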
@@ -448,41 +433,24 @@ class LSTMCell(RNNCellBase):
         name (str, optional): Name for the operation (optional, default is
             None). For more information, please refer to :ref:`api_guide_Name`.
 
-    Attributes:
-        weight_ih (Parameter): shape (4 * hidden_size, input_size), input to
-            hidden weight, which corresponds to the concatenation of
-            :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
-        weight_hh (Parameter): shape (4 * hidden_size, hidden_size), hidden to
-            hidden weight, which corresponds to the concatenation of
-            :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
-        bias_ih (Parameter): shape (4 * hidden_size, ), input to hidden bias,
-            which corresponds to the concatenation of
-            :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
-        bias_hh (Parameter): shape (4 * hidden_size, ), hidden to hidden bias,
-            which corresponds to the concatenation of
-            :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.
+    Variables:
+        - **weight_ih** (Parameter): shape (4 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
+        - **weight_hh** (Parameter): shape (4 * hidden_size, hidden_size), hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
+        - **bias_ih** (Parameter): shape (4 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
+        - **bias_hh** (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.
 
     Inputs:
-        inputs (Tensor): shape `[batch_size, input_size]`, the input,
-            corresponding to :math:`x_t` in the formula.
-        states (tuple, optional): a tuple of two tensors, each of shape
-            `[batch_size, hidden_size]`, the previous hidden state,
-            corresponding to :math:`h_{t-1}, c_{t-1}` in the formula.
-            When states is None, zero state is used. Defaults to None.
+        - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_t` in the formula.
+        - **states** (tuple, optional): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
 
     Returns:
-        (outputs, new_states)
-        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
-            corresponding to :math:`h_{t}` in the formula.
-        states (tuple): a tuple of two tensors, each of shape
-            `[batch_size, hidden_size]`, the new hidden states,
-            corresponding to :math:`h_{t}, c_{t}` in the formula.
+        - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
+        - **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
 
     Notes:
         All the weights and bias are initialized with `Uniform(-std, std)` by
-        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more
-        information about parameter initialization, please refer to
-        :ref:`api_fluid_ParamAttr`.
+        default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
+        information about parameter initialization, please refer to :ref:`api_fluid_ParamAttr`.
 
     Examples:
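The same pattern for `LSTMCell`, whose state is the tuple `(h, c)` listed in the rewritten Inputs and Returns sections (sizes again illustrative):

.. code-block:: python

    import paddle

    cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)

    x = paddle.randn([4, 16])       # x_t
    prev_h = paddle.randn([4, 32])  # h_{t-1}
    prev_c = paddle.randn([4, 32])  # c_{t-1}

    y, (h, c) = cell(x, (prev_h, prev_c))  # y is h_t; the new state is (h_t, c_t)
    print(y.shape, h.shape, c.shape)       # [4, 32] [4, 32] [4, 32]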
@@ -582,11 +550,11 @@
 
     .. math::
 
-        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t-1} + b_{hr})
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
 
-        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t-1} + b_{hz})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
 
-        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t-1} + b_{hc}))
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
 
         h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
 
@@ -599,7 +567,7 @@ class GRUCell(RNNCellBase):
         `_ for more details.
 
     Parameters:
-        input_size (int): The input size..
+        input_size (int): The input size.
         hidden_size (int): The hidden size.
         weight_ih_attr(ParamAttr, optional): The parameter attribute for
             `weight_ih`. Default: None.
@@ -612,38 +580,24 @@ class GRUCell(RNNCellBase):
         name (str, optional): Name for the operation (optional, default is
             None). For more information, please refer to :ref:`api_guide_Name`.
 
-    Attributes:
-        weight_ih (Parameter): shape (3 * hidden_size, input_size), input to
-            hidden weight, which corresponds to the concatenation of
-            :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
-        weight_hh (Parameter): shape (3 * hidden_size, hidden_size), hidden to
-            hidden weight, which corresponds to the concatenation of
-            :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
-        bias_ih (Parameter): shape (3 * hidden_size, ), input to hidden bias,
-            which corresponds to the concatenation of
-            :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
-        bias_hh (Parameter): shape (3 * hidden_size, ), hidden to hidden bias,
-            which corresponds to the concatenation of
-            :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
+    Variables:
+        - **weight_ih** (Parameter): shape (3 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ir}, W_{iz}, W_{ic}` in the formula.
+        - **weight_hh** (Parameter): shape (3 * hidden_size, hidden_size), hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hr}, W_{hz}, W_{hc}` in the formula.
+        - **bias_ih** (Parameter): shape (3 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ir}, b_{iz}, b_{ic}` in the formula.
+        - **bias_hh** (Parameter): shape (3 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hr}, b_{hz}, b_{hc}` in the formula.
 
     Inputs:
-        inputs (Tensor): A tensor with shape `[batch_size, input_size]`,
-            corresponding to :math:`x_t` in the formula.
-        states (Tensor): A tensor with shape `[batch_size, hidden_size]`.
-            corresponding to :math:`h_{t-1}` in the formula.
+        - **inputs** (Tensor): A tensor with shape `[batch_size, input_size]`, corresponding to :math:`x_t` in the formula.
+        - **states** (Tensor): A tensor with shape `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}` in the formula.
 
     Returns:
-        (outputs, new_states)
-        outputs (Tensor): shape `[batch_size, hidden_size]`, the output,
-            corresponding to :math:`h_{t}` in the formula.
-        states (Tensor): shape `[batch_size, hidden_size]`, the new hidden
-            state, corresponding to :math:`h_{t}` in the formula.
+        - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
+        - **states** (Tensor): shape `[batch_size, hidden_size]`, the new hidden state, corresponding to :math:`h_{t}` in the formula.
 
     Notes:
         All the weights and bias are initialized with `Uniform(-std, std)` by
-        default. Where std = :math:`\frac{1}{\sqrt{hidden_size}}`. For more
-        information about parameter initialization, please refer to
-        :ref:`api_fluid_ParamAttr`.
+        default, where std = :math:`\frac{1}{\sqrt{hidden\_size}}`. For more
+        information about parameter initialization, please refer to :ref:`api_fluid_ParamAttr`.
 
     Examples:
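And for `GRUCell`, where the state is a single tensor h (illustrative sizes):

.. code-block:: python

    import paddle

    cell = paddle.nn.GRUCell(input_size=16, hidden_size=32)

    x = paddle.randn([4, 16])       # x_t
    prev_h = paddle.randn([4, 32])  # h_{t-1}

    y, h = cell(x, prev_h)          # output and new state are both h_t
    print(y.shape)                  # [4, 32]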
@@ -745,32 +699,14 @@ class RNN(Layer):
         time steps. Defaults to False.
 
     Inputs:
-        inputs (Tensor): A (possibly nested structure of) tensor[s]. The input
-            sequences.
-            If time major is False, the shape is `[batch_size, time_steps, input_size]`
-            If time major is True, the shape is `[time_steps, batch_size, input_size]`
-            where `input_size` is the input size of the cell.
-        initial_states (Tensor|list|tuple, optional): Tensor of a possibly
-            nested structure of tensors, representing the initial state for
-            the rnn cell. If not provided, `cell.get_initial_states` would be
-            called to produce the initial states. Defaults to None.
-        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
-            or int32. The valid lengths of input sequences. Defaults to None.
-            If `sequence_length` is not None, the inputs are treated as
-            padded sequences. In each input sequence, elements whose time step
-            index are not less than the valid length are treated as paddings.
-        **kwargs: Additional keyword arguments to pass to `forward` of the cell.
+        - **inputs** (Tensor): A (possibly nested structure of) tensor[s]. The input sequences. If time major is False, the shape is `[batch_size, time_steps, input_size]`. If time major is True, the shape is `[time_steps, batch_size, input_size]`, where `input_size` is the input size of the cell.
+        - **initial_states** (Tensor|list|tuple, optional): Tensor of a possibly nested structure of tensors, representing the initial state for the rnn cell. If not provided, `cell.get_initial_states` would be called to produce the initial states. Defaults to None.
+        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
+        - **kwargs**: Additional keyword arguments to pass to `forward` of the cell.
 
     Returns:
-        (outputs, final_states)
-        outputs (Tensor|list|tuple): the output sequences.
-            If `time_major` is True, the shape is
-            `[time_steps, batch_size, hidden_size]`, else
-            `[batch_size, time_steps, hidden_size]`.
-        final_states (Tensor|list|tuple): final states of the cell. Tensor or
-            a possibly nested structure of tensors which has the same structure
-            with intial state. Each tensor in final states has the same shape
-            and dtype as the corresponding tensor in initial states.
+        - **outputs** (Tensor|list|tuple): the output sequences. If `time_major` is True, the shape is `[time_steps, batch_size, hidden_size]`, else `[batch_size, time_steps, hidden_size]`.
+        - **final_states** (Tensor|list|tuple): final states of the cell. Tensor or a possibly nested structure of tensors which has the same structure with initial state. Each tensor in final states has the same shape and dtype as the corresponding tensor in initial states.
 
     Notes:
         This class is a low level API for wrapping rnn cell into a RNN network.
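To make the rewritten Inputs and Returns concrete: `RNN` unrolls a single cell over the time axis. A minimal sketch (the cell choice and sizes are illustrative):

.. code-block:: python

    import paddle

    cell = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
    rnn = paddle.nn.RNN(cell)  # time_major defaults to False

    inputs = paddle.randn([4, 23, 16])   # [batch_size, time_steps, input_size]
    outputs, final_states = rnn(inputs)  # initial states default to zeros

    print(outputs.shape)  # [4, 23, 32]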
@@ -838,33 +774,14 @@ class BiRNN(Layer):
         time steps. Defaults to False.
 
     Inputs:
-        inputs (Tensor): the input sequences of both RNN.
-            If time_major is True, the shape of is
-            `[time_steps, batch_size, input_size]`, else the shape is
-            `[batch_size, time_steps, input_size]`, where input_size is the
-            input size of both cells.
-        initial_states (list|tuple, optional): A tuple/list of the initial
-            states of the forward cell and backward cell. Defaults to None.
-            If not provided, `cell.get_initial_states` would be called to
-            produce the initial states for each cell. Defaults to None.
-        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
-            or int32. The valid lengths of input sequences. Defaults to None.
-            If `sequence_length` is not None, the inputs are treated as
-            padded sequences. In each input sequence, elements whose time step
-            index are not less than the valid length are treated as paddings.
-        **kwargs: Additional keyword arguments. Arguments passed to `forward`
-            for each cell.
+        - **inputs** (Tensor): the input sequences of both RNN. If time_major is True, the shape is `[time_steps, batch_size, input_size]`, else the shape is `[batch_size, time_steps, input_size]`, where input_size is the input size of both cells.
+        - **initial_states** (list|tuple, optional): A tuple/list of the initial states of the forward cell and backward cell. If not provided, `cell.get_initial_states` would be called to produce the initial states for each cell. Defaults to None.
+        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
+        - **kwargs**: Additional keyword arguments. Arguments passed to `forward` for each cell.
 
     Outputs:
-        (outputs, final_states)
-        outputs (Tensor): the outputs of the bidirectional RNN. It is the
-            concatenation of the outputs from the forward RNN and backward
-            RNN along the last axis.
-            If time major is True, the shape is `[time_steps, batch_size, size]`,
-            else the shape is `[batch_size, time_steps, size]`, where size is
-            `cell_fw.hidden_size + cell_bw.hidden_size`.
-        final_states (tuple): A tuple of the final states of the forward
-            cell and backward cell.
+        - **outputs** (Tensor): the outputs of the bidirectional RNN. It is the concatenation of the outputs from the forward RNN and backward RNN along the last axis. If time major is True, the shape is `[time_steps, batch_size, size]`, else the shape is `[batch_size, time_steps, size]`, where size is `cell_fw.hidden_size + cell_bw.hidden_size`.
+        - **final_states** (tuple): A tuple of the final states of the forward cell and backward cell.
 
     Notes:
         This class is a low level API for wrapping rnn cells into a BiRNN
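Likewise for `BiRNN`, which runs a forward and a backward cell and concatenates their outputs along the last axis (illustrative sizes):

.. code-block:: python

    import paddle

    cell_fw = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
    cell_bw = paddle.nn.LSTMCell(input_size=16, hidden_size=32)
    rnn = paddle.nn.BiRNN(cell_fw, cell_bw)

    inputs = paddle.randn([4, 23, 16])
    outputs, final_states = rnn(inputs)

    # cell_fw.hidden_size + cell_bw.hidden_size in the last axis
    print(outputs.shape)  # [4, 23, 64]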
@@ -1150,12 +1067,11 @@
 
     .. math::
 
-        h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h{t-1} + b_{hh})
+        h_{t} & = act(W_{ih}x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
 
         y_{t} & = h_{t}
 
-    where :math:`act` is for :attr:`activation` , and * is the elemetwise
-    multiplication operator.
+    where :math:`act` is for :attr:`activation`.
 
     Using key word arguments to construct is recommended.
 
@@ -1183,43 +1099,20 @@ class SimpleRNN(RNNBase):
         name (str, optional): Name for the operation (optional, default is
             None). For more information, please refer to :ref:`api_guide_Name`.
 
-    Inputs:
-        inputs (Tensor): the input sequence.
-            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
-            else, the shape is `[batch_size, time_steps, hidden_size]`.
-        initial_states (Tensor, optional): the initial state. The shape is
-            `[num_layers * num_directions, batch_size, hidden_size]`.
-            If initial_state is not given, zero initial states are used.
-        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
-            or int32. The valid lengths of input sequences. Defaults to None.
-            If `sequence_length` is not None, the inputs are treated as
-            padded sequences. In each input sequence, elements whose time step
-            index are not less than the valid length are treated as paddings.
+    Inputs:
+        - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`.
+        - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
+        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
 
     Returns:
-        (outputs, final_states)
-        outputs (Tensor): the output sequence.
-            If `time_major` is True, the shape is
-            `[time_steps, batch_size, num_directions * hidden_size]`,
-            else, the shape is
-            `[batch_size, time_steps, num_directions * hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-        final_states (Tensor): final states. The shape is
-            `[num_layers * num_directions, batch_size, hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-
-    Attributes:
-        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
-            If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise,
-            the shape is `[hidden_size, num_directions * hidden_size]`.
-        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
-            with shape `[hidden_size, hidden_size]`.
-        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
-        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
+        - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+        - **final_states** (Tensor): final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+
+    Variables:
+        - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
+        - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
+        - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
+        - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.
 
     Examples:
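A sketch of the multi-layer `SimpleRNN` shapes the rewritten Returns section lists (sizes illustrative):

.. code-block:: python

    import paddle

    rnn = paddle.nn.SimpleRNN(input_size=16, hidden_size=32, num_layers=2)

    x = paddle.randn([4, 23, 16])  # [batch_size, time_steps, input_size]
    y, h = rnn(x)                  # zero initial states by default

    print(y.shape)  # [4, 23, 32]
    print(h.shape)  # [2, 4, 32]: [num_layers * num_directions, batch_size, hidden_size]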
@@ -1321,43 +1214,19 @@ class LSTM(RNNBase):
         None). For more information, please refer to :ref:`api_guide_Name`.
 
     Inputs:
-        inputs (Tensor): the input sequence.
-            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
-            else, the shape is `[batch_size, time_steps, hidden_size]`.
-        initial_states (tuple, optional): the initial state, a tuple of (h, c),
-            the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`.
-            If initial_state is not given, zero initial states are used.
-        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
-            or int32. The valid lengths of input sequences. Defaults to None.
-            If `sequence_length` is not None, the inputs are treated as
-            padded sequences. In each input sequence, elements whos time step
-            index are not less than the valid length are treated as paddings.
+        - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`.
+        - **initial_states** (tuple, optional): the initial state, a tuple of (h, c), the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
+        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
 
     Returns:
-        (outputs, final_states)
-        outputs (Tensor): the output sequence.
-            If `time_major` is True, the shape is
-            `[time_steps, batch_size, num_directions * hidden_size]`,
-            If `time_major` is False, the shape is
-            `[batch_size, time_steps, num_directions * hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-        final_states (tuple): the final state, a tuple of two tensors, h and c.
-            The shape of each is
-            `[num_layers * num_directions, batch_size, hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-
-    Attributes:
-        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
-            If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise,
-            the shape is `[hidden_size, num_directions * hidden_size]`.
-        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
-            with shape `[hidden_size, hidden_size]`.
-        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
-        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
+        - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`. If `time_major` is False, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+        - **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+
+    Variables:
+        - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
+        - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
+        - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
+        - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.
 
     Examples:
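For `LSTM`, the bidirectional case makes the `num_directions` factor in the rewritten shapes visible (sizes illustrative):

.. code-block:: python

    import paddle

    lstm = paddle.nn.LSTM(
        input_size=16, hidden_size=32, num_layers=2, direction="bidirectional")

    x = paddle.randn([4, 23, 16])
    y, (h, c) = lstm(x)

    print(y.shape)  # [4, 23, 64]: num_directions * hidden_size in the last axis
    print(h.shape)  # [4, 4, 32]: [num_layers * num_directions, batch_size, hidden_size]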
@@ -1412,11 +1281,11 @@ class GRU(RNNBase):
 
     .. math::
 
-        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t-1} + b_{hr})
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
 
-        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t-1} + b_{hz})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
 
-        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t-1} + b_{hc}))
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
 
         h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
 
@@ -1450,43 +1319,19 @@ class GRU(RNNBase):
         None). For more information, please refer to :ref:`api_guide_Name`.
 
     Inputs:
-        inputs (Tensor): the input sequence.
-            If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`,
-            else, the shape is `[batch_size, time_steps, hidden_size]`.
-        initial_states (Tensor, optional): the initial state. The shape is
-            `[num_layers * num_directions, batch_size, hidden_size]`.
-            If initial_state is not given, zero initial states are used.
-            Defaults to None.
-        sequence_length (Tensor, optional): shape `[batch_size]`, dtype: int64
-            or int32. The valid lengths of input sequences. Defaults to None.
-            If `sequence_length` is not None, the inputs are treated as
-            padded sequences. In each input sequence, elements whos time step
-            index are not less than the valid length are treated as paddings.
+        - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`.
+        - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. Defaults to None.
+        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings.
 
     Returns:
-        (outputs, final_states)
-        outputs (Tensor): the output sequence.
-            If `time_major` is True, the shape is
-            `[time_steps, batch_size, num_directions * hidden_size]`,
-            else, the shape is
-            `[batch_size, time_steps, num_directions * hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-        final_states (Tensor): final states. The shape is
-            `[num_layers * num_directions, batch_size, hidden_size]`.
-            Note that `num_directions` is 2 if direction is "bidirectional"
-            else 1.
-
-    Attributes:
-        weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
-            If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise,
-            the shape is `[hidden_size, num_directions * hidden_size]`.
-        weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
-            with shape `[hidden_size, hidden_size]`.
-        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
-        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
-            with shape `[hidden_size]`.
+        - **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, else, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+        - **final_states** (Tensor): final states. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1.
+
+    Variables:
+        - **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
+        - **weight_hh_l[k]**: the learnable hidden-hidden weights of the k-th layer, with shape `[hidden_size, hidden_size]`.
+        - **bias_ih_l[k]**: the learnable input-hidden bias of the k-th layer, with shape `[hidden_size]`.
+        - **bias_hh_l[k]**: the learnable hidden-hidden bias of the k-th layer, with shape `[hidden_size]`.
 
     Examples:
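And the matching sketch for `GRU`, whose final state is a single tensor (sizes illustrative):

.. code-block:: python

    import paddle

    gru = paddle.nn.GRU(input_size=16, hidden_size=32, num_layers=2)

    x = paddle.randn([4, 23, 16])
    y, h = gru(x)

    print(y.shape)  # [4, 23, 32]
    print(h.shape)  # [2, 4, 32]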