From 20f0878f7085678fefdd86df181222ca52e1a1e8 Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Fri, 27 Sep 2019 20:50:59 +0800
Subject: [PATCH] Fix en docs of apis (#20050)

* fix en docs of apis, test=develop, test=document_fix

* follow chunwei's comments, test=develop
---
 paddle/fluid/API.spec             |  12 +--
 python/paddle/fluid/clip.py       |   6 +-
 python/paddle/fluid/layers/nn.py  |  85 ++++++++-------------
 python/paddle/fluid/lod_tensor.py | 118 +++++++++++++++---------------
 python/paddle/fluid/optimizer.py  |  25 ++++---
 python/paddle/fluid/param_attr.py |  38 +++++-----
 6 files changed, 134 insertions(+), 150 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 74c3920674b..0a0c3663f3c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -48,8 +48,8 @@ paddle.fluid.ParallelExecutor ('paddle.fluid.parallel_executor.ParallelExecutor'
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40'))
 paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '0af092676e5b1320bb4232396154ce4b'))
-paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
-paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
+paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', '0627369b86ff974f433f7078d1e78349'))
+paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '4829bd8c4a4f1b19438500def321cb65'))
 paddle.fluid.DataFeedDesc ('paddle.fluid.data_feed_desc.DataFeedDesc', ('document', '43877a0d9357db94d3dbc7359cbe8c73'))
 paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '9c6615854b61caa5f0d3e6ccc5e51338'))
@@ -133,7 +133,7 @@ paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_at
 paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9045b8971e4232132ec9952695f4c3ae'))
 paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '5ce117258e243be1c81539e254178d90'))
 paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66'))
-paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d'))
+paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '789a141e97fd0b37241f630935936d08'))
 paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4'))
 paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'bbb9e708bab250359864fefbdf48e9d9'))
 paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c'))
@@ -978,7 +978,7 @@ paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss',
 paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
-paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', '3f1c5385519a3674c18c3a1ab34ac04f'))
+paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', 'e132700f81e9c5d27a7b3cd32b38d714'))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
@@ -1062,7 +1062,7 @@ paddle.fluid.CUDAPlace ('paddle.fluid.core_avx.CUDAPlace', ('document', '6a6cd8e
 paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None
 paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('document', 'afd58ea5d390b5ea06ca70291a266d45'))
 paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None
-paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'cd667b4ee96d7d6fca40aa722d67d744'))
+paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'a4d4d13ce9eeb86bbaa7ab935c207577'))
 paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'b5ae1698ea72d5a9428000b916a67379'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -1071,7 +1071,7 @@ paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'p
 paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'a0ed5ce816b5d603cb595aacb922335a'))
 paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc'))
 paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f'))
-paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '77ca02bb37b70d226510df9cf5e45965'))
+paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'a0b00ccc8584b4a1cf4ec5aa74780e77'))
 paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', 'e6f815a03be88dee2537707d9e6b9209'))
 paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.clip.GradientClipByValue ('paddle.fluid.clip.GradientClipByValue', ('document', 'b7a22f687269cae0c338ef3866322db7'))
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index aeef8505f8e..2188c069b8b 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -342,13 +342,13 @@ def set_gradient_clip(clip, param_list=None, program=None):
     To specify parameters that require gradient clip.
 
     Args:
-        clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
+        clip (BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
            for example :ref:`api_fluid_clip_GradientClipByGlobalNorm` , which describes
            the type and detailed attributes of the required gradient clip.
-        param_list(list(Variable), optional): Parameters that require gradient clip.
+        param_list (list(Variable), optional): Parameters that require gradient clip.
            It can be a list of parameters or a list of parameter names.
            Default None, meaning that all parameters in the program will be included.
-        program(Program, optional): The program where parameters are located.
+        program (Program, optional): The program where parameters are located.
            Default None, meaning that :ref:`api_fluid_default_main_program` is used.
 
     Returns:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8f6c8a5d127..0632ad9ad2c 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1680,76 +1680,53 @@ def dropout(x,
 def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     """
-    **Cross Entropy Layer**
+    This operator computes the cross entropy between input and label. It
+    supports both hard-label and soft-label cross entropy computation.
 
-    This layer computes the cross entropy between `input` and `label`. It
-    supports both standard cross-entropy and soft-label cross-entropy loss
-    computation.
-
-    1) One-hot cross-entropy:
-        `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+    1. Hard-label cross entropy: if soft_label=False, :math:`label[i_1, i_2, ..., i_k]`
+       is the hard label of each sample.
 
        .. math::
 
-            Y[i] = -\log(X[i, Label[i]])
+           output[i_1, i_2, ..., i_k]=-log(input[i_1, i_2, ..., i_k, j]), label[i_1, i_2, ..., i_k] = j, j != ignore\_index
 
-    2) Soft-label cross-entropy:
-        `soft_label = True`, `Label[i, j]` indicates the soft label of class j
-        for sample i:
+    2. Soft-label cross entropy: if soft_label=True, :math:`label[i_1, i_2, ..., i_k, j]`
+       is the soft label of each sample corresponding to the j-th class.
 
        .. math::
 
-            Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
-
-       Please make sure that in this case the summation of each row of `label`
-       equals one.
-
-    3) One-hot cross-entropy with vecterized `label`:
-        As a special case of 2), when each row of 'label' has only one
-        non-zero element which is equal to 1, soft-label cross-entropy degenerates
-        to a one-hot cross-entropy with one-hot label representation.
+           output[i_1, i_2, ..., i_k]= -\sum_{j}label[i_1,i_2,...,i_k,j]*log(input[i_1, i_2, ..., i_k,j])
 
     Args:
-        input (Variable|list): a 2-D tensor with shape [N x D], where N is the
-                               batch size and D is the number of classes. This
-                               input is a probability computed by the previous
-                               operator, which is almost always the result of
-                               a softmax operator.
-        label (Variable|list): the ground truth which is a 2-D tensor. When
-                               `soft_label` is set to `False`, `label` is a
-                               tensor with shape [N x 1]. When
-                               `soft_label` is set to `True`, `label` is a
-                               tensor with shape [N x D].
-        soft_label (bool): a flag indicating whether to
-                           interpretate the given labels as soft
-                           labels. Default: `False`.
-        ignore_index (int): Specifies a target value that is ignored and does
-                            not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: kIgnoreIndex
+        input (Variable): a multidimensional Tensor with shape
+            :math:`[N_1, N_2, ..., N_k, D]`, where the last dimension D is
+            the number of classes. The data type should be float32 or float64.
+        label (Variable): label value corresponding to input. If
+            soft_label=False, the shape of label should be :math:`[N_1, N_2, ..., N_k]`
+            or :math:`[N_1, N_2, ..., N_k, 1]` , its data type should be int64,
+            and the values must be in the range [0, D). If soft_label=True, the shape
+            and data type of label should be the same as those of input, and the sum
+            of the soft label values of each sample should be 1.
+        soft_label (bool): indicates whether the label is soft. Default False, meaning
+            that the label is hard. If soft_label=True, the label is soft.
+        ignore_index (int): specifies a label value to be ignored. The ignored label is
+            omitted when computing the loss. If it is a negative integer, no label is
+            ignored. Only valid when soft_label=False. Default -100.
 
     Returns:
-        A 2-D tensor with shape [N x 1], the cross entropy loss.
-
-    Raises:
-        ValueError:
-
-                      1. the 1st dimension of ``input`` and ``label`` are not equal.
-
-                      2. when ``soft_label == True``, and the 2nd dimension of
-                         ``input`` and ``label`` are not equal.
-
-                      3. when ``soft_label == False``, and the 2nd dimension of
-                         ``label`` is not 1.
+        A Variable holding a Tensor representing the cross entropy, whose data type is the same as that of input.
+        If soft_label=False, the shape of the output is the same as that of label.
+        If soft_label=True, the shape of the output is :math:`[N_1, N_2, ..., N_k, 1]` .
 
     Examples:
        .. code-block:: python
 
-            import paddle.fluid as fluid
-            classdim = 7
-            x = fluid.layers.data(name='x', shape=[3, 7], dtype='float32', append_batch_size=False)
-            label = fluid.layers.data(name='label', shape=[3, 1], dtype='float32', append_batch_size=False)
-            predict = fluid.layers.fc(input=x, size=classdim, act='softmax')
-            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            import paddle.fluid as fluid
+            class_num = 7
+            x = fluid.layers.data(name='x', shape=[3, 10], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            predict = fluid.layers.fc(input=x, size=class_num, act='softmax')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
     """
    if not soft_label:
        return cross_entropy2(input, label, ignore_index)
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 9d5ed2e6d99..941ace24b36 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -23,51 +23,51 @@ __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
 def create_lod_tensor(data, recursive_seq_lens, place):
     """
-    Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+    Create a LoDTensor from a numpy array, list or existing LoDTensor.
 
-    Create a lod tensor by doing the following:
+    The implementation is as follows:
 
-    1. Check that the length-based level of detail (LoD) also known as
-       recursive_sequence_lengths of the input is valid.
+    1. Check whether the length-based LoD, i.e., :code:`recursive_seq_lens`
+       is valid.
 
-    2. Convert recursive_sequence_lengths to a offset-based LoD.
+    2. Convert :code:`recursive_seq_lens` to an offset-based LoD.
 
-    3. Copy the data from a numpy array, a list or a existing lod tensor to
-       CPU or GPU device (based on input place).
+    3. Based on :code:`place` , copy the :code:`data` from a numpy array, list
+       or existing LoDTensor to the CPU or GPU device.
 
-    4. Set the level of detail (LoD) using the offset-based LoD.
+    4. Set the offset-based LoD on the output LoDTensor.
 
-    Examples:
+    Suppose we want to create a LoDTensor to hold data for word sequences,
+    where each word is represented by an integer, and we want to represent
+    two sentences, one of 2 words and one of 3 words.
 
-    Suppose we want LoDTensor to hold data for sequences of word, where each
-    word is represented by an integer. If we want to create a LoDTensor to
-    represent two sentences, one of 2 words, and one of 3 words.
+    Then :code:`data` would be a numpy array of integers with shape (5, 1).
+    :code:`recursive_seq_lens` would be [[2, 3]], indicating the number of
+    words in each sentence. This length-based :code:`recursive_seq_lens` [[2, 3]]
+    would be converted to the offset-based LoD [[0, 2, 5]] inside the function
+    call.
 
-    Then :code:`data` can be a numpy array of integers with shape (5, 1).
-    :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each
-    sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
-    offset-based LoD [[0, 2, 5]] inside the function call.
+    Please refer to :ref:`user_guide_lod_tensor` for more details regarding LoD.
 
-        .. code-block:: python
+    Args:
+        data (numpy.ndarray|list|LoDTensor): a numpy array, a list or an existing
+            LoDTensor holding the data to be copied.
+        recursive_seq_lens (list[list[int]]): a list of lists indicating the
+            length-based LoD info.
+        place (CPUPlace|CUDAPlace): CPU or GPU place indicating where the data
+            in the created LoDTensor will be stored.
 
-        import paddle.fluid as fluid
-        import numpy as np
+    Returns:
+        A LoDTensor with tensor data and recursive_seq_lens info.
 
-        t = fluid.create_lod_tensor(np.ndarray([5, 30]), [[2, 3]], fluid.CPUPlace())
+    Examples:
 
-    Please reference :ref:`api_guide_low_level_lod_tensor` for more details
-    regarding LoD.
+        .. code-block:: python
 
-    Args:
-        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
-            list holding the data to be copied.
-        recursive_seq_lens(list): a list of lists indicating the length-based level of detail
-            info specified by the user.
-        place(Place): CPU or GPU place indicating where the data in the new
-            LoDTensor will be stored.
+            import paddle.fluid as fluid
+            import numpy as np
 
-    Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
+            t = fluid.create_lod_tensor(np.ndarray([5, 30]), [[2, 3]], fluid.CPUPlace())
     """
    if isinstance(data, core.LoDTensor):
        return create_lod_tensor(np.array(data), recursive_seq_lens, place)
@@ -116,47 +116,47 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
     """
     Create a LoDTensor containing random integers.
 
-    This function is frequently used in the book examples. So we revised it
-    based on the new create_lod_tensor API and put it here in the lod_tensor
-    module to simplify the code.
-
-    The function does the following:
-
-    1. Calculate the overall shape of the LoDTensor based on the length-based
-       :code:`recursive_seq_lens` input and the shape of the basic element in
-       :code:`base_shape`.
+    The implementation is as follows:
 
-    2. Create a numpy array of this shape.
+    1. Obtain the shape of the output LoDTensor based on :code:`recursive_seq_lens`
+       and :code:`base_shape` . The first dimension of the shape is the total
+       length of the sequences, while the other dimensions are the same as
+       :code:`base_shape` .
 
-    3. Create the LoDTensor using create_lod_tensor API.
+    2. Create a numpy array of random integers, and pass the created numpy
+       array as the parameter :code:`data` of :ref:`api_fluid_create_lod_tensor`
+       to create the output LoDTensor.
 
-    Suppose we want LoDTensor to hold data for sequences of word, where each
-    word is represented by an integer. If we want to create a LoDTensor to
-    represent two sentences, one of 2 words, and one of 3 words. Then
-    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
-    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
-    for two sentences.
+    Suppose we want to create a LoDTensor to hold data for 2 sequences, where
+    the dimensions of the sequences are [2, 30] and [3, 30] respectively.
+    The :code:`recursive_seq_lens` would be [[2, 3]], and :code:`base_shape`
+    would be [30] (the other dimensions excluding the sequence length).
+    Therefore, the shape of the output LoDTensor would be [5, 30], where
+    the first dimension 5 is the total length of the sequences, and the
+    other dimensions are :code:`base_shape`.
 
     Args:
-        recursive_seq_lens(list): a list of lists indicating the length-based
-            level of detail info specified by the user.
-        base_shape(list): the shape of the basic element to be held by the
-            LoDTensor.
-        place(Place): CPU or GPU place indicating where the data in the new
-            LoDTensor will be stored.
-        low(int): the lower bound of the random integers.
-        high(int): the upper bound of the random integers.
+        recursive_seq_lens (list[list[int]]): a list of lists indicating the
+            length-based LoD info.
+        base_shape (list[int]): the shape of the output LoDTensor excluding
+            the first dimension.
+        place (CPUPlace|CUDAPlace): CPU or GPU place indicating where
+            the data in the created LoDTensor will be stored.
+        low (int): the lower bound of the random integers.
+        high (int): the upper bound of the random integers.
 
     Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
+        A LoDTensor with tensor data and recursive_seq_lens info, whose data
+        lies within [low, high].
 
     Examples:
        .. code-block:: python
 
            import paddle.fluid as fluid
 
-            t = fluid.create_random_int_lodtensor(recursive_seq_lens=[[2, 3]],
-                    base_shape=[30], place=fluid.CPUPlace(), low=0, high=10)
+            t = fluid.create_random_int_lodtensor(recursive_seq_lens=[[2, 3]],
+                    base_shape=[30], place=fluid.CPUPlace(), low=0, high=10)
+            print(t.shape()) # [5, 30]
     """
    assert isinstance(base_shape, list), "base_shape should be a list"
    # append the total number of basic elements to the front of its shape
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 3feff3a1d97..3758eb5d43d 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1782,26 +1782,29 @@ class DecayedAdagradOptimizer(Optimizer):
 class AdadeltaOptimizer(Optimizer):
     """
-    **NOTES: This API does not support sparse parameter optimization.**
+    **Notes: This API does not support sparse parameter optimization.**
 
     Adadelta Optimizer. Please refer to this for details:
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <https://arxiv.org/abs/1212.5701>`_.
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
+
+    The update is done as follows:
 
     .. math::
 
-        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2\\
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
 
-        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }\\
+        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
 
-        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
 
     Args:
-        learning_rate(float|Variable): global learning rate.
-        epsilon(float): a small float number for numeric stability. Default 1.0e-6.
-        rho(float): a floating point value indicating the decay rate.
-        regularization(WeightDecayRegularizer, optional): A Regularizer, such as fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no regularization.
-        name(str, optional): A optional name prefix for debugging. Default None.
+        learning_rate (float|Variable): global learning rate.
+        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
+        rho (float): a floating point value indicating the decay rate. Default 0.95.
+        regularization (WeightDecayRegularizer, optional): A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer. Default None, meaning that there is no
+            regularization.
+        name (str, optional): An optional name prefix for debugging. Default None.
 
     Examples:
        .. code-block:: python
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 028aada68cd..ebf0f2e0cbc 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -27,23 +27,26 @@ __all__ = [
 
 class ParamAttr(object):
     """
-    Parameter attributes object. To fine-tuning network training process, user
-    can set parameter's attributes to control training details. Such as learning rate,
-    regularization, trainable, do_model_average and the method to initialize param.
-
-
-    Args:
-        name(str): The parameter's name. Default None.
-        initializer(Initializer): The method to initial this parameter. Default None.
-        learning_rate(float): The parameter's learning rate. The learning rate when
-            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
-            Default 1.0.
-        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
-        trainable(bool): Whether this parameter is trainable. Default True.
-        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
-            gradient. Default None.
-        do_model_average(bool): Whether this parameter should do model average
-            when model average is enabled. Default True.
+    Create an object to represent the attributes of a parameter. The attributes are:
+    name, initializer, learning rate, regularizer, trainable, gradient clip,
+    and model average.
+
+    Parameters:
+        name (str, optional): The parameter's name. Default None, meaning that the name
+            would be created automatically.
+        initializer (Initializer, optional): The method to initialize this parameter. Default
+            None, meaning that the weight parameter is initialized by the Xavier initializer,
+            and the bias parameter is initialized to 0.
+        learning_rate (float): The parameter's learning rate. The effective learning rate
+            during optimization is the global learning rate times the parameter's learning
+            rate times the factor of the learning rate scheduler. Default 1.0.
+        regularizer (WeightDecayRegularizer, optional): Regularization factor. Default None,
+            meaning there is no regularization.
+        trainable (bool): Whether this parameter is trainable. Default True.
+        gradient_clip (BaseGradientClipAttr, optional): The method to clip this parameter's
+            gradient. Default None, meaning that there is no gradient clip.
+        do_model_average (bool): Whether this parameter should do model average
+            when model average is enabled. Default True.
 
     Examples:
        .. code-block:: python
@@ -54,6 +57,7 @@ class ParamAttr(object):
                                           learning_rate=0.5,
                                           regularizer=fluid.regularizer.L2Decay(1.0),
                                           trainable=True)
+            print(w_param_attrs.name) # "fc_weight"
             x = fluid.layers.data(name='X', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
     """
-- 
GitLab
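
Supplementary sketches for the APIs documented above follow. First, the set_gradient_clip hunk shows only the revised Args section, so here is a minimal usage sketch of the documented signature set_gradient_clip(clip, param_list=None, program=None). It assumes that fluid.clip.GradientClipByGlobalNorm, the class the docstring references, accepts a clip_norm threshold:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))

    # With param_list=None and program=None, clipping applies to all
    # parameters of the default main program (assumed per the Args above).
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    # Called before minimize, so the clip ops are inserted when the
    # backward pass appends gradients.
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)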
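The new cross_entropy docstring stops at graph construction. A runnable end-to-end sketch of the hard-label case, assuming the fluid 1.x Executor API; the batch size of 4 and the random feed values are illustrative only:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    class_num = 7
    x = fluid.layers.data(name='x', shape=[10], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')  # hard labels in [0, class_num)
    predict = fluid.layers.fc(input=x, size=class_num, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    loss, = exe.run(feed={'x': np.random.random((4, 10)).astype('float32'),
                          'label': np.random.randint(0, class_num, (4, 1)).astype('int64')},
                    fetch_list=[cost])
    # Per the Returns section above, the output has the same shape as label
    # in hard-label mode: one cross entropy value per sample.
    print(loss.shape)  # (4, 1)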
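Both lod_tensor docstrings above use the same [[2, 3]] running example. A round-trip sketch combining the two helpers under the signatures documented in this patch; the data values are arbitrary, and the recursive_sequence_lengths() accessor is assumed from the fluid 1.x LoDTensor API:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    # Two sentences of 2 and 3 words; each word is an integer id of shape [1].
    data = np.random.randint(0, 10, size=(5, 1)).astype('int64')
    t = fluid.create_lod_tensor(data, recursive_seq_lens=[[2, 3]], place=fluid.CPUPlace())
    print(t.recursive_sequence_lengths())  # [[2, 3]]
    print(np.array(t).shape)               # (5, 1)

    # The random-integer helper derives the first dimension (2 + 3 = 5)
    # from recursive_seq_lens and appends base_shape to it.
    t2 = fluid.create_random_int_lodtensor(recursive_seq_lens=[[2, 3]], base_shape=[30],
                                           place=fluid.CPUPlace(), low=0, high=10)
    print(t2.shape())  # [5, 30]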
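The AdadeltaOptimizer hunk ends right at its Examples header, so the example body is not visible in this diff. A minimal sketch consistent with the constructor ArgSpec recorded in API.spec above; the learning rate 0.0003 is an arbitrary choice:

.. code-block:: python

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))

    # epsilon and rho are shown at their documented defaults (1.0e-6 and 0.95).
    optimizer = fluid.optimizer.AdadeltaOptimizer(
        learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
    optimizer.minimize(loss)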