update English Documents, test=release/1.6, test=document_fix (#20482)

be52f333 · DuYao · hong · 534cf892 · be52f333 · be52f333
4 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -199,7 +199,7 @@ paddle.fluid.layers.lod_append (ArgSpec(args=['x', 'level'], varargs=None, keywo
 paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', 'fa565b65fb98d3ca82361c79f41b06b2'))
 paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '46b3ada86dd2c79042dca90a55e08f66'))
 paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '89aa122a50dc20ee116ae49d66854d20'))
-paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf'))
+paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70b6f4ab59e60650231b1ead4ad46222'))
 paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', '6fc9bae94518bbf3e1a9e479f38f6537'))
 paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '3885fd76e122ac0563fa8369bcab7363'))
 paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-05, None)), ('document', '08d94daffbea3935178810bdc1633f07'))
@@ -604,7 +604,7 @@ paddle.fluid.dygraph.Conv2D.set_dict (ArgSpec(args=['self', 'stat_dict', 'includ
 paddle.fluid.dygraph.Conv2D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
 paddle.fluid.dygraph.Conv2D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.Conv2D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3D ('paddle.fluid.dygraph.nn.Conv3D', ('document', '50412bd3fbf3557a8ef48e25c6517025'))
+paddle.fluid.dygraph.Conv3D ('paddle.fluid.dygraph.nn.Conv3D', ('document', 'f81dee6781d6c18d0e7f5ca66b2fb010'))
 paddle.fluid.dygraph.Conv3D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Conv3D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
 paddle.fluid.dygraph.Conv3D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -689,7 +689,7 @@ paddle.fluid.dygraph.Embedding.set_dict (ArgSpec(args=['self', 'stat_dict', 'inc
 paddle.fluid.dygraph.Embedding.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
 paddle.fluid.dygraph.Embedding.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.Embedding.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.GRUUnit ('paddle.fluid.dygraph.nn.GRUUnit', ('document', '389e860e455b67aab1f4d472ac9d7e49'))
+paddle.fluid.dygraph.GRUUnit ('paddle.fluid.dygraph.nn.GRUUnit', ('document', 'f0e648f0a8d3389f755698dde488dc93'))
 paddle.fluid.dygraph.GRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.GRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
 paddle.fluid.dygraph.GRUUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -757,7 +757,7 @@ paddle.fluid.dygraph.PRelu.set_dict (ArgSpec(args=['self', 'stat_dict', 'include
 paddle.fluid.dygraph.PRelu.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
 paddle.fluid.dygraph.PRelu.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.PRelu.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.BilinearTensorProduct ('paddle.fluid.dygraph.nn.BilinearTensorProduct', ('document', 'be70d0f6d43729d9cb80c9a34ed5f26b'))
+paddle.fluid.dygraph.BilinearTensorProduct ('paddle.fluid.dygraph.nn.BilinearTensorProduct', ('document', 'ddea5bc0668a636ded7db09538511c20'))
 paddle.fluid.dygraph.BilinearTensorProduct.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'name', 'act', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.BilinearTensorProduct.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
 paddle.fluid.dygraph.BilinearTensorProduct.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -791,7 +791,7 @@ paddle.fluid.dygraph.Conv2DTranspose.set_dict (ArgSpec(args=['self', 'stat_dict'
 paddle.fluid.dygraph.Conv2DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '9d689f44592cd22812c7ec06a9654eac'))
 paddle.fluid.dygraph.Conv2DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62'))
 paddle.fluid.dygraph.Conv2DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.Conv3DTranspose ('paddle.fluid.dygraph.nn.Conv3DTranspose', ('document', '91ba132bc690eaf76eabdbde8f87e4a0'))
+paddle.fluid.dygraph.Conv3DTranspose ('paddle.fluid.dygraph.nn.Conv3DTranspose', ('document', '0ef981fd6a74aaff21673f9925736ac7'))
 paddle.fluid.dygraph.Conv3DTranspose.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.Conv3DTranspose.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1'))
 paddle.fluid.dygraph.Conv3DTranspose.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995'))
@@ -870,31 +870,31 @@ paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, key
 paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.save_dygraph (ArgSpec(args=['state_dict', 'model_path'], varargs=None, keywords=None, defaults=None), ('document', '7c2bd58a69f9bca3b884f44154c84569'))
 paddle.fluid.dygraph.load_dygraph (ArgSpec(args=['model_path'], varargs=None, keywords=None, defaults=None), ('document', 'd6d98002c39d2484835f4748e35b761c'))
-paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', '9ccfea97dbf15134d406a23aae1e1fa2'))
+paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', '3441619381487db8d1929a205f3c6d41'))
 paddle.fluid.dygraph.NoamDecay.__init__ (ArgSpec(args=['self', 'd_model', 'warmup_steps', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.NoamDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.NoamDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '8f4d37eaad4e2f5b12850f3663856758'))
+paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '0fccf303b94a13ae670fb3dd51931f73'))
 paddle.fluid.dygraph.PiecewiseDecay.__init__ (ArgSpec(args=['self', 'boundaries', 'values', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.PiecewiseDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.PiecewiseDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', '94bed58b392a5a71b6d1abd39eed7111'))
+paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', '5fef27468d49ca8ca6c6a9635ad0f5c1'))
 paddle.fluid.dygraph.NaturalExpDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.NaturalExpDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.NaturalExpDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', 'a259689c649c5f82636536386ce2ef19'))
+paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', '846eb564df136d8a8917bf16b5b8ac9b'))
 paddle.fluid.dygraph.ExponentialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.ExponentialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.ExponentialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', '6a868b2c7cc0f09f57ef71902bbc93ca'))
+paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', '1a74f0370e2e64f9e786d3c336526e6d'))
 paddle.fluid.dygraph.InverseTimeDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.InverseTimeDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.InverseTimeDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', 'bb90314cee58952f13522dcd571ca832'))
+paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', 'e222a066a2bcf31bc52a14271048e034'))
 paddle.fluid.dygraph.PolynomialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.PolynomialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.PolynomialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', '46dadadee1a8a92d70bd277d9345bfb0'))
+paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', '0d7fe2b87492a0eb5cde60dbe268ea17'))
 paddle.fluid.dygraph.CosineDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'step_each_epoch', 'epochs', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.dygraph.CosineDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866'))
 paddle.fluid.dygraph.CosineDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))

--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -69,30 +69,33 @@ class LearningRateDecay(object):

 class PiecewiseDecay(LearningRateDecay):
    """
-    piecewise decay scheduler
+    Piecewise decay scheduler.

    The algorithm can be described as the code below.

    .. code-block:: text

-      boundaries = [10000, 20000]
-      values = [1.0, 0.5, 0.1]
-      if step < 10000:
-          learning_rate = 1.0
-      elif 10000 <= step < 20000:
-          learning_rate = 0.5
-      else:
-          learning_rate = 0.1
-    Args:
-        boundaries: A list of steps numbers.
-        values: A list of learning rate values that will be picked during
-            different step boundaries.
-        begin: The begin step to initilize the self.step_num
-        step: The step_size using when calculate the new step_num (Defalult is 1)
-        dtype: The dtype used to create the learning rate variable
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if global_step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= global_step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+
+    Parameters:
+        boundaries(list): A list of step numbers. The type of each element in the list is Python int.
+        values(list): A list of learning rate values that will be picked during
+            different step boundaries. The type of element in the list is python float.
+        begin(int): The begin step used to initialize the global_step in the description above.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python
@@ -125,25 +128,40 @@ class NaturalExpDecay(LearningRateDecay):
    """
    Applies natural exponential decay to the initial learning rate.
    
-    .. code-block:: python
+    The algorithm can be described as follows.

-        if not staircase:
-            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-        else:
-            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-
-    Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
-        begin: A Python 'int32' number, the begin step (Default is 0)
-        step: A Python 'int32' number, the step size (Default is 1)
-        dtype: A Python 'str', the dtype used to create learning rate variable (Default is 'float32')
+    .. math::
+
+        decayed\_learning\_rate = learning\_rate * e^{y} 
+
+    If staircase is set to False, then:
+
+    .. math::
+
+        y = - decay\_rate * \\frac{global\_step}{decay\_steps}
+
+    If staircase is set to True, then:
+
+    .. math::
+
+        y = - decay\_rate * math.floor(\\frac{global\_step}{decay\_steps}) 
+
+    Parameters:
+        learning_rate(Variable|float): The initial learning rate. If the type 
+            is Variable, it's a tensor with shape [1], the data type can be  
+            float32 or float64. It can also be set to a Python int number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        decay_rate(float): The decay rate.
+        staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The 
+            default value is False.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python
@@ -189,29 +207,41 @@ class ExponentialDecay(LearningRateDecay):
    """
    Applies exponential decay to the learning rate.

-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    'decay_rate' every 'decay_steps' steps.
+    The algorithm can be described as follows.
    
-    .. code-block:: python
+    .. math::

-        if staircase == True:
-            decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
-        else:
-            decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
+        decayed\_learning\_rate = learning\_rate * decay\_rate ^ y 
+
+    If staircase is set to False, then:
+
+    .. math::
+
+        y = \\frac{global\_step}{decay\_steps} 
+
+    If staircase is set to True, then:
+
+    .. math::
+
+        y = math.floor(\\frac{global\_step}{decay\_steps})
+
+
+    Parameters:
+        learning_rate(Variable|float): The initial learning rate. If the type 
+            is Variable, it's a tensor with shape [1], the data type can be  
+            float32 or float64. It can also be set to a Python int number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        decay_rate(float): The decay rate.
+        staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The 
+            default value is False.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python
@@ -257,27 +287,35 @@ class InverseTimeDecay(LearningRateDecay):
    """
    Applies inverse time decay to the initial learning rate.

-    When training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, an inverse decay function will be
-    applied to the initial learning rate.
-
-    >>> if staircase == True:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    >>> else:
-    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
-
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        decay_steps(int): See the decay computation above.
-        decay_rate(float): The decay rate. See the decay computation above.
-        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-                            Default: False
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
+    The algorithm can be described as follows.
+    If staircase is set to False, then:
+
+    .. math::
+
+        decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * \\frac{global\_step}{decay\_steps}}
+
+    If staircase is set to True, then:
+
+    .. math::
+
+        decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * math.floor(\\frac{global\_step}{decay\_steps})}
+
+    Parameters:
+        learning_rate(Variable|float): The initial learning rate. If the type 
+            is Variable, it's a tensor with shape [1], the data type can be  
+            float32 or float64. It can also be set to a Python int number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        decay_rate(float): The decay rate.
+        staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The 
+            default value is False.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be 
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python
@@ -323,28 +361,40 @@ class PolynomialDecay(LearningRateDecay):
    """
    Applies polynomial decay to the initial learning rate.

-    .. code-block:: text
+    The algorithm can be described as follows.
+
+    If cycle is set to True, then:
+
+    .. math::
+
+        decay\_steps & = decay\_steps * math.ceil(\\frac{global\_step}{decay\_steps}) 

-     if cycle:
-       decay_steps = decay_steps * ceil(global_step / decay_steps)
-     else:
-       global_step = min(global_step, decay_steps)
-       decayed_learning_rate = (learning_rate - end_learning_rate) *
-            (1 - global_step / decay_steps) ^ power + end_learning_rate
-
-    Args:
-        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps(int32): A Python `int32` number.
-        end_learning_rate(float): A Python `float` number.
-        power(float): A Python `float` number.
-        cycle(bool): If set true, decay the learning rate every decay_steps.
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
+        decayed\_learning\_rate & = (learning\_rate-end\_learning\_rate)*(1-\\frac{global\_step}{decay\_steps})^{power}+end\_learning\_rate
+
+    If cycle is set to False, then:
+
+    .. math::
+
+        global\_step & = min(global\_step, decay\_steps) 
+
+        decayed\_learning\_rate & = (learning\_rate-end\_learning\_rate)*(1-\\frac{global\_step}{decay\_steps})^{power}+end\_learning\_rate
+
+    Parameters:
+        learning_rate(Variable|float): The initial learning rate. If the type 
+            is Variable, it's a tensor with shape [1], the data type can be  
+            float32 or float64. It can also be set to a Python int number.
+        decay_steps(int): The decay step size. It determines the decay cycle.
+        end_learning_rate(float, optional): The minimum final learning rate. The default value is 0.0001.
+        power(float, optional): Power of polynomial. The default value is 1.0.
+        cycle(bool, optional): If set true, decay the learning rate every decay_steps. The default value is False.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python
@@ -401,24 +451,26 @@ class CosineDecay(LearningRateDecay):
    """
    Applies cosine decay to the learning rate.

-    when training a model, it is often recommended to lower the learning rate as the
-    training progresses. By using this function, the learning rate will be decayed by
-    following cosine decay strategy.
+    The algorithm can be described as follows.

    .. math::

-	decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
+        decayed\_learning\_rate = learning\_rate * 0.5 * (math.cos(math.floor(\\frac{global\_step}{step\_each\_epoch}) * \\frac{math.pi}{epochs}) + 1)
    
-    Args:
-        learning_rate(Variable|float): The initial learning rate.
-        step_each_epoch(int): the number of steps in an epoch.
-        epochs(int): the number of epochs.
-        begin(int): The begin step (default is 0).
-        step(int): The step size (default is 1).
-        dtype(str): The dtype used to create learning rate (default is 'float32').
+    Parameters:
+        learning_rate(Variable|float): The initial learning rate. If the type 
+            is Variable, it's a tensor with shape [1], the data type can be  
+            float32 or float64. It can also be set to a Python int number.
+        step_each_epoch(int): The number of steps in an epoch.
+        epochs(int): The number of epochs.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
 	.. code-block:: python
@@ -453,33 +505,29 @@ class CosineDecay(LearningRateDecay):

 class NoamDecay(LearningRateDecay):
    """
-    Noam decay method. The numpy implementation of noam decay as follows.
-
-    .. code-block:: python
-      
-      import numpy as np
-      # set hyper parameters
-      d_model = 2
-      current_steps = 20
-      warmup_steps = 200
-      # compute
-      lr_value = np.power(d_model, -0.5) * np.min([
-                              np.power(current_steps, -0.5),
-                              np.power(warmup_steps, -1.5) * current_steps])
-
-    Please reference `attention is all you need
-    <https://arxiv.org/pdf/1706.03762.pdf>`_.
-
-    Args:
-        d_model(Variable): The dimensionality of input and output of model.
-
-        warmup_steps(Variable): A super parameter.
-        begin(int): The begin step (default is 0)
-        step(int): The step size (default is 1)
-        dtype(str): The dtype used to create learning rate (default is 'float32')
+    Applies Noam decay to the initial learning rate. 
+
+    The algorithm can be described as follows.
+
+    .. math::
+
+        decayed\_learning\_rate = d_{model}^{-0.5} * min(global\_step^{-0.5}, global\_step * warmup\_steps^{-1.5})
+
+    Please refer to `Attention Is All You Need <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Parameters:
+        d_model(Variable|int): The dimensionality of the input and output feature vectors of the model. If the type is Variable,
+            it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int.
+        warmup_steps(Variable|int): The number of warmup steps. A hyperparameter. If the type is Variable,
+            it's a tensor with shape [1] and the data type can be int32 or int64. The type can also be python int.
+        begin(int, optional): The begin step. The initial value of global_step described above. The default value is 1.
+        step(int, optional): The step size used to calculate the new global_step in the description above.
+            The default value is 1.
+        dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as
+            'float32', 'float64'. The default value is 'float32'.

    Returns:
-        The decayed learning rate.
+        None.

    Examples:
        .. code-block:: python

--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8986,8 +8986,8 @@ def label_smooth(label,
                 dtype="float32",
                 name=None):
    """
-    Label smoothing is a mechanism to regularize the classifier layer and is
-    called label-smoothing regularization (LSR).
+    Label smoothing is a mechanism to regularize the classifier layer and is called 
+    label-smoothing regularization (LSR). 

    Label smoothing is proposed to encourage the model to be less confident,
    since optimizing the log-likelihood of the correct label directly may
@@ -9006,19 +9006,23 @@ def label_smooth(label,

    See more details about label smoothing in https://arxiv.org/abs/1512.00567.

-    Args:
+    Parameters:
        label(Variable): The input variable containing the label data. The
-                          label data should use one-hot representation.
-        prior_dist(Variable): The prior distribution to be used to smooth
-                              labels. If not provided, an uniform distribution
-                              is used. The shape of :attr:`prior_dist` should
-                              be :math:`(1, class\_num)`.
-        epsilon(float): The weight used to mix up the original ground-truth
-                        distribution and the fixed distribution.
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32,
-                                                  float_64, int etc.
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+                        label data should use one-hot representation. It's 
+                        a multidimensional tensor with a shape of 
+                        :math:`[N_1, ..., Depth]`, where Depth is class number.
+        prior_dist(Variable, optional): The prior distribution to be used to smooth
+                        labels. If not provided, a uniform distribution
+                        is used. It's a multidimensional tensor with a shape of
+                        :math:`[1, class\_num]` . The default value is None.
+        epsilon(float, optional): The weight used to mix up the original ground-truth
+                        distribution and the fixed distribution. The default value is 
+                        0.1.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type can be set
+                        as 'float32', 'float64'. The default value is 'float32'.
+        name(str, optional): The default value is None. Normally there is no need for user 
+                        to set this property. For more information, please refer to 
+                        :ref:`api_guide_Name`.

    Returns:
        Variable: The tensor variable containing the smoothed labels.