Unverified · Commit 52979565 · Authored by: Yuan Shuai · Committed by: GitHub

API/OP (clip_by_norm/clip) error message enhancement (#23603)

* error message enhance for clip_by_norm. test=develop

* fix clip_by_norm. test=develop

* fix clip error message. test=develop
Parent: b4b6763a
@@ -67,7 +67,10 @@ class ClipByNormKernel : public framework::OpKernel<T> {
           framework::ToTypeName(in_var->Type()));
     }
-    PADDLE_ENFORCE_NOT_NULL(input);
+    PADDLE_ENFORCE_NOT_NULL(input,
+                            platform::errors::InvalidArgument(
+                                "Input(X) of ClipByNormOp should not be null. "
+                                "Please check if it is created correctly."));
     auto x = EigenVector<T>::Flatten(*input);
     auto out = EigenVector<T>::Flatten(*output);
@@ -89,12 +92,19 @@ class ClipByNormOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ClipByNormOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ClipByNormOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of ClipByNormOp should not be null. Please "
+                          "check if it is created correctly."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Out) of ClipByNormOp should not be null. "
+                          "Please check if it is created correctly."));
     auto max_norm = ctx->Attrs().Get<float>("max_norm");
-    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+    PADDLE_ENFORCE_GT(max_norm, 0, platform::errors::InvalidArgument(
+                                       "max_norm should be greater than 0. "
+                                       "Received max_norm is %f.",
+                                       max_norm));
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
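The practical effect of the new check can be seen from Python. A minimal sketch, assuming the Paddle 1.x fluid static-graph API (the exact exception type and wording may vary by build): a non-positive max_norm is now rejected at op-creation time with an InvalidArgument message carrying the received value.

    import paddle.fluid as fluid

    x = fluid.data(name='x', shape=[-1, 10], dtype='float32')
    try:
        # max_norm must be > 0; InferShape rejects this when the op is created
        out = fluid.layers.clip_by_norm(x=x, max_norm=-1.0)
    except Exception as e:
        # e.g. "max_norm should be greater than 0. Received max_norm is -1.000000."
        print(e)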
@@ -23,14 +23,21 @@ class ClipOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ClipOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ClipOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of ClipOp should not be null. Please check "
+                          "if it is created correctly."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Out) of ClipOp should not be null. Please "
+                          "check if it is created correctly."));
     auto x_dims = ctx->GetInputDim("X");
     auto max = ctx->Attrs().Get<float>("max");
     auto min = ctx->Attrs().Get<float>("min");
-    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    PADDLE_ENFORCE_LT(min, max, platform::errors::InvalidArgument(
+                                    "Max of ClipOp should be greater than min. "
+                                    "Received max is %f, received min is %f.",
+                                    max, min));
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -52,7 +59,7 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Clip Operator.
 The clip operator limits the value of given input within an interval [min, max],
 just as the following equation,
 $$
@@ -68,9 +75,14 @@ class ClipOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        platform::errors::InvalidArgument("Input(X) should not be null. Please "
+                                          "check if it is created correctly."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Out@GRAD) should not be null. Please check if "
+                          "it is created correctly."));
     auto x_dims = ctx->GetInputDim("X");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
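Similarly for ClipOp: a sketch under the same assumptions as above, where min >= max now reports both received values instead of the old bare message.

    import paddle.fluid as fluid

    x = fluid.data(name='x', shape=[-1, 10], dtype='float32')
    try:
        # min must be strictly less than max
        out = fluid.layers.clip(x=x, min=2.0, max=1.0)
    except Exception as e:
        # e.g. "Max of ClipOp should be greater than min. Received max is
        # 1.000000, received min is 2.000000."
        print(e)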
@@ -381,7 +381,7 @@ def embedding(input,
     of output Tensor is generated by replacing the last dimension of the input Tensor shape
     with emb_size.
     **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` ,
     otherwise the program will throw an exception and exit.
     .. code-block:: text
@@ -399,12 +399,12 @@ def embedding(input,
                 [[0.345249859, 0.124939536, ..., 0.194353745],
                  [0.945345345, 0.435394634, ..., 0.435345365]],
                 [[0.945345345, 0.435394634, ..., 0.435345365],
                  [0.0,         0.0,         ..., 0.0        ]]]  # padding data
         The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127
         It will pad all-zero data when ids is 127.
         Case 2:
         input is a LoDTensor with 1-level LoD. padding_idx = 0
@@ -429,22 +429,22 @@ def embedding(input,
         size(tuple|list): The shape of lookup table parameter. It should have two elements which
             indicates the size of the dictionary of embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update. This parameter only
             affects the performance of the backwards gradient update. It is recommended to set
             True because sparse update is faster. But some optimizer does not support sparse update,
             such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` ,
             :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` ,
             :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` .
             In these case, is_sparse must be False. Default: False.
         is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used
             in multi-machine distributed CPU training. Default: False.
         padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size).
             If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
             to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
             encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
             If set None, it makes no effect to output. Default: None.
         param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
             default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition,
             user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter.
             The local word vector needs to be transformed into numpy format, and the shape of local word
             vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer`
             is used to load custom or pre-trained word vectors. See code example 2 for details.
@@ -471,7 +471,7 @@ def embedding(input,
                 learning_rate=0.5,
                 initializer=fluid.initializer.NumpyArrayInitializer(weight_data),
                 trainable=True)
             emb_2 = fluid.layers.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32')
     """
     helper = LayerHelper('embedding', **locals())
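Aside from the diff itself, the padding_idx normalization described in "Case 1" of this docstring is simple arithmetic; a minimal pure-Python sketch (the helper name is hypothetical, not part of the fluid API):

    def normalize_padding_idx(padding_idx, vocab_size):
        # A negative padding_idx is shifted by vocab_size before lookup,
        # e.g. padding_idx=-1 with vocab_size=128 becomes 127 ("Case 1" above).
        if padding_idx is None:
            return None
        return padding_idx if padding_idx >= 0 else vocab_size + padding_idx

    assert normalize_padding_idx(-1, 128) == 127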
@@ -652,11 +652,11 @@ def _pull_box_sparse(input, size, dtype='float32'):
     :attr:`input`.
     Args:
         input(Variable|list of Variable): Input is a Tensor<int64> Variable, which
            contains the IDs information.
         size(int): The embedding size parameter, which indicates the size of
            each embedding vector respectively.
         dtype(str): The dtype refers to the data type of output tensor. Only supports
            float32 now.
     Returns:
@@ -668,7 +668,7 @@ def _pull_box_sparse(input, size, dtype='float32'):
           import paddle.fluid as fluid
           data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1)
           emb = fluid.layers.pull_box_sparse(input=data, size=[11])
     """
     helper = LayerHelper('pull_box_sparse', **locals())
     if dtype != 'float32':
@@ -699,7 +699,7 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
     ${comment}
     Args:
         input(${emission_type}): ${emission_comment}
         label(${label_type}): ${label_comment}
         Length(${length_type}): ${length_comment}
         param_attr(ParamAttr): The attribute of the learnable parameter for transition parameter.
@@ -727,17 +727,17 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
                     label=label,
                     param_attr=fluid.ParamAttr(
                         name='crfw',
                         learning_rate=0.01))
             use_cuda = False
             place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(startup_program)
             #define data, using LoDTensor
             a = fluid.create_lod_tensor(np.random.rand(12,10).astype('float32'), [[3,3,4,2]], place)
             b = fluid.create_lod_tensor(np.array([[1],[1],[2],[3],[1],[1],[1],[3],[1],[1],[1],[1]]),[[3,3,4,2]] , place)
             feed1 = {'input_data':a,'label':b}
             loss= exe.run(train_program,feed=feed1, fetch_list=[crf_cost])
             print(loss)
             #define net structure, using padding
             train_program = fluid.Program()
@@ -766,7 +766,7 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
             ll=np.array([[3],[3],[4],[2]])
             feed2 = {'input_data2':cc,'label2':dd,'length':ll}
             loss2= exe.run(train_program,feed=feed2, fetch_list=[crf_cost2])
             print(loss2)
             #[array([[ 7.8902354],
             #        [ 7.3602567],
             #        [ 10.004011],
@@ -775,7 +775,7 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
             #you can use find_var to get transition parameter.
             transition=np.array(fluid.global_scope().find_var('crfw').get_tensor())
             print(transition)
     """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[2] if length else input.shape[1]
@@ -819,12 +819,12 @@ def crf_decoding(input, param_attr, label=None, length=None):
     Args:
         input(${emission_type}): ${emission_comment}
         param_attr (ParamAttr|None): To specify the weight parameter attribute.
            Default: None, which means the default weight parameter property is
            used. See usage for details in :ref:`api_fluid_ParamAttr` .
         label(${label_type}, optional): ${label_comment}
         length(${length_type}, optional): ${length_comment}
     Returns:
@@ -840,10 +840,10 @@ def crf_decoding(input, param_attr, label=None, length=None):
           feature = fluid.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
           label = fluid.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
           emission = fluid.layers.fc(input=feature, size=num_labels)
           crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label,
                     param_attr=fluid.ParamAttr(name="crfw"))
           crf_decode = fluid.layers.crf_decoding(input=emission,
                     param_attr=fluid.ParamAttr(name="crfw"))
           # Common tensor example
@@ -853,8 +853,8 @@ def crf_decoding(input, param_attr, label=None, length=None):
           length = fluid.data(name='length', shape=[-1, 1], dtype='int64')
           emission = fluid.layers.fc(input=feature, size=num_labels,
                       num_flatten_dims=2)
           crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
                     param_attr=fluid.ParamAttr(name="crfw_pad"))
           crf_decode = fluid.layers.crf_decoding(input=emission, length=length,
                     param_attr=fluid.ParamAttr(name="crfw_pad"))
@@ -1166,7 +1166,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
     This operator implements the softmax layer. The calculation process is as follows:
     1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
     2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
     second dimension(row length) is the same as the dimension :attr:`axis` of the input
     tensor, and the first dimension(column length) is the product of all other
@@ -1175,7 +1175,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
     of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
     K-dimensional vector of real values in the range [0, 1] that add up to 1.
     3. After the softmax operation is completed, the inverse operations of steps 1 and 2
     are performed to restore the two-dimensional matrix to the same dimension as the ``input``.
     It computes the exponential of the given dimension and the sum of exponential
@@ -1235,7 +1235,7 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
               [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
              [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
               [0.26762315, 0.26762315, 0.26762315, 0.26762315],
               [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
     Args:
         input (Variable): The input variable. A multi-dimension ``Tensor`` with type float32 or float64.
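The permute-flatten-normalize procedure the softmax docstring describes condenses to a short NumPy sketch (an editor's illustration of the math, not the operator's kernel):

    import numpy as np

    def softmax_along_axis(x, axis=-1):
        # Shift for numerical stability, exponentiate, then normalize by the
        # sum along `axis`; each slice along `axis` ends up summing to 1.
        shifted = x - np.max(x, axis=axis, keepdims=True)
        e = np.exp(shifted)
        return e / np.sum(e, axis=axis, keepdims=True)

    data = np.random.rand(2, 3, 4).astype('float32')
    print(softmax_along_axis(data, axis=-1).sum(axis=-1))  # all ones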
@@ -1353,30 +1353,30 @@ def conv2d(input,
         W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
     Args:
         input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type
            of input is float16 or float32 or float64.
         num_filters(int): The number of filter. It is as same as the output
            image channel.
         filter_size (int|tuple): The filter size. If filter_size
            is a tuple, it must contain two integers, (filter_size_height,
            filter_size_width). Otherwise, filter_size_height = filter_size_width =\
            filter_size.
         stride (int|tuple): The stride size. It means the stride in convolution.
            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
            on both sides for each dimension.If `padding` is a string, either 'VALID' or
            'SAME' which is the padding algorithm. If padding size is a tuple or list,
            it could be in three forms: `[pad_height, pad_width]` or
            `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when
            `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0],
            [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            Default: padding = 0.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel
            points. If dilation is a tuple, it must contain two integers, (dilation_height,
            dilation_width). Otherwise, dilation_height = dilation_width = dilation.
            Default: dilation = 1.
         groups (int): The groups number of the Conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -1397,18 +1397,18 @@ def conv2d(input,
            library is installed. Default: True
         act (str): Activation type, if it is set to None, activation is not appended.
            Default: None
         name(str|None): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.
         data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
            `[batch_size, input_channels, input_height, input_width]`.
     Returns:
         A Variable holding Tensor representing the conv2d, whose data type is the
         same with input. If act is None, the tensor variable storing the convolution
         result, and if act is not None, the tensor variable storing convolution
         and non-linearity activation result.
     Raises:
@@ -1416,7 +1416,7 @@ def conv2d(input,
         ValueError: If `data_format` is not "NCHW" or "NHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
            or the element corresponding to the input's channel is not 0.
         ShapeError: If the input is not 4-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
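The H_out/W_out formula quoted at the top of the conv2d docstring reduces to a one-liner; a sketch with a hypothetical helper (integer division assumes the default floor behavior):

    def conv_out_size(in_size, pad, dilation, kernel, stride):
        # W_out = (W_in + 2*pad - (dilation*(kernel - 1) + 1)) / stride + 1
        return (in_size + 2 * pad - (dilation * (kernel - 1) + 1)) // stride + 1

    # e.g. a 32x32 input with a 3x3 kernel, stride 1, padding 1 stays 32x32
    assert conv_out_size(32, pad=1, dilation=1, kernel=3, stride=1) == 32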
@@ -1621,18 +1621,18 @@ def conv3d(input,
         W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
     Args:
         input (Variable): The input is 5-D Tensor with shape [N, C, D, H, W], the data
            type of input is float16 or float32 or float64.
         num_filters(int): The number of filter. It is as same as the output
            image channel.
         filter_size (int|tuple): The filter size. If filter_size is a tuple,
            it must contain three integers, (filter_size_depth, filter_size_height,
            filter_size_width). Otherwise, filter_size_depth = filter_size_height = \
            filter_size_width = filter_size.
         stride (int|tuple): The stride size. It means the stride in convolution. If stride is a
            tuple, it must contain three integers, (stride_depth, stride_height, stride_width).
            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
         padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
            on both sides for each dimension. If `padding` is a string, either 'VALID' or
            'SAME' which is the padding algorithm. If padding size is a tuple or list,
            it could be in three forms: `[pad_depth, pad_height, pad_width]` or
@@ -1642,9 +1642,9 @@ def conv3d(input,
            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            Default: padding = 0.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel points.
            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
            Default: dilation = 1.
         groups (int): The groups number of the Conv3d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -1665,18 +1665,18 @@ def conv3d(input,
            library is installed. Default: True
         act (str): Activation type, if it is set to None, activation is not appended.
            Default: None.
         name(str|None): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.
         data_format (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
            `[batch_size, input_channels, input_height, input_width]`.
     Returns:
         A Variable holding Tensor representing the conv3d, whose data type is
         the same with input. If act is None, the tensor variable storing the
         convolution result, and if act is not None, the tensor variable storing
         convolution and non-linearity activation result.
     Raises:
@@ -1684,7 +1684,7 @@ def conv3d(input,
         ValueError: If `data_format` is not "NCDHW" or "NDHWC".
         ValueError: If the channel dimmention of the input is less than or equal to zero.
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
         ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
            or the element corresponding to the input's channel is not 0.
         ShapeError: If the input is not 5-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
@@ -2062,8 +2062,8 @@ def pool3d(input,
                          the number of channels, `D` is the depth of the feature,
                          `H` is the height of the feature, and `W` is the width
                          of the feature.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size
            is a tuple or list, it must contain three integers,
            (pool_size_Depth, pool_size_Height, pool_size_Width).
            Otherwise, the pool kernel size will be the cube of an int.
         pool_type (string): ${pooling_type_comment}
@@ -2308,7 +2308,7 @@ def adaptive_pool2d(input,
            None by default.
     Returns:
         Variable: The output tensor of adaptive pooling result. The data type is same
                   as input tensor.
     Raises:
@@ -2441,7 +2441,7 @@ def adaptive_pool3d(input,
       Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
     Args:
         input (Variable): The input tensor of pooling operator, which is a 5-D tensor with
                           shape [N, C, D, H, W]. The format of input tensor is NCDHW, where
                           N is batch size, C is the number of channels, D is the depth of the feature,
                           H is the height of the feature, and W is the width of the feature.
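The adaptive_pool3d averaging formula above depends on how dstart/dend and friends are chosen; the usual rule (an assumption here, shown in 1-D for brevity) splits the input into out_size nearly equal bins:

    import numpy as np

    def adaptive_avg_pool1d(x, out_size):
        # Assumed index rule: start = floor(i*n/out), end = ceil((i+1)*n/out)
        n = len(x)
        out = np.empty(out_size, dtype=x.dtype)
        for i in range(out_size):
            start = (i * n) // out_size
            end = -(-((i + 1) * n) // out_size)  # ceiling division
            out[i] = x[start:end].mean()
        return out

    print(adaptive_avg_pool1d(np.arange(6, dtype='float32'), 3))  # [0.5 2.5 4.5]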
@@ -2609,7 +2609,7 @@ def batch_norm(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
         moving\_mean = moving\_mean * momentum + mini-batch\_mean * (1. - momentum) \\\\
         moving\_var = moving\_var * momentum + mini-batch\_var * (1. - momentum)
     moving_mean is global mean and moving_var is global variance.
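The moving-statistics update quoted just above is a plain exponential moving average; a one-line sketch with a hypothetical helper name:

    def update_moving_stat(moving, batch_stat, momentum):
        # moving = moving * momentum + batch_stat * (1 - momentum)
        return moving * momentum + batch_stat * (1.0 - momentum)

    # with momentum=0.9 the global mean drifts slowly toward each mini-batch mean
    assert abs(update_moving_stat(0.0, 1.0, 0.9) - 0.1) < 1e-12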
@@ -2627,12 +2627,12 @@ def batch_norm(input,
         y_i &\\gets \\gamma \\hat{x_i} + \\beta
     Note:
         if build_strategy.sync_batch_norm=True, the batch_norm in network will use
         sync_batch_norm automatically.
         `is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`.
     Args:
         input(Variable): The rank of input variable can be 2, 3, 4, 5. The data type
            is float16 or float32 or float64.
         act(string, Default None): Activation type, linear|relu|prelu|...
         is_test (bool, Default False): A flag indicating whether it is in
@@ -2648,25 +2648,25 @@ def batch_norm(input,
         param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
            If the Initializer of the param_attr is not set, the parameter is initialized
            with Xavier. Default: None.
         bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
            If the Initializer of the bias_attr is not set, the bias is initialized zero.
            Default: None.
         data_layout (str, optional): Specify the data format of the input, and the data format of the output
            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
            The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
            `[batch_size, input_channels, input_height, input_width]`.
         in_place(bool, Default False): Make the input and output of batch norm reuse memory.
         name(str|None): For detailed information, please refer to :ref:`api_guide_Name`.
            Usually name is no need to set and None by default.
         moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it
            is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
            will save global mean with the string.
         moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance.
            If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
            will save global variance with the string.
         do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model
            average when model average is enabled.
@@ -2676,8 +2676,8 @@ def batch_norm(input,
            In train mode, when setting use_global_stats True, the global mean
            and variance are also used during train period.
     Returns:
         A Variable holding Tensor which is the result after applying batch normalization on the input,
         has same shape and data type with input.
     Examples:
...@@ -2842,10 +2842,10 @@ def inplace_abn(input, ...@@ -2842,10 +2842,10 @@ def inplace_abn(input,
act_alpha=1.0): act_alpha=1.0):
""" """
**In-place Activation Batch Normalization Layer** **In-place Activation Batch Normalization Layer**
This layer calculates batch normalization and activation with in-place memory. This layer calculates batch normalization and activation with in-place memory.
For batch normalization calculations, see `fluid.layers.batch_norm`. For batch normalization calculations, see `fluid.layers.batch_norm`.
For in-place activation batch normalization, see `In-Place Activated BatchNorm for For in-place activation batch normalization, see `In-Place Activated BatchNorm for
Memory-Optimized Training of DNNs <https://arxiv.org/abs/1712.02616>`_ Memory-Optimized Training of DNNs <https://arxiv.org/abs/1712.02616>`_
`inplace_abn` only support activation type as `None`, `identity`, `leaky_relu`, `inplace_abn` only support activation type as `None`, `identity`, `leaky_relu`,
...@@ -2853,12 +2853,12 @@ def inplace_abn(input, ...@@ -2853,12 +2853,12 @@ def inplace_abn(input,
`inplace_abn` only support data type as `float32`, `float64` currently. `inplace_abn` only support data type as `float32`, `float64` currently.
Note: Note:
if build_strategy.sync_batch_norm=True, the batch_norm in network will use if build_strategy.sync_batch_norm=True, the batch_norm in network will use
sync_batch_norm automatically. sync_batch_norm automatically.
`is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`. `is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`.
Args: Args:
input(Variable): The rank of input variable can be 2, 3, 4, 5. The data type input(Variable): The rank of input variable can be 2, 3, 4, 5. The data type
is float16 or float32 or float64. is float16 or float32 or float64.
act(string, Default None): Activation type, linear|relu|prelu|... act(string, Default None): Activation type, linear|relu|prelu|...
is_test (bool, Default False): A flag indicating whether it is in is_test (bool, Default False): A flag indicating whether it is in
...@@ -2872,26 +2872,26 @@ def inplace_abn(input, ...@@ -2872,26 +2872,26 @@ def inplace_abn(input,
epsilon(float, Default 1e-05): A value added to the denominator for epsilon(float, Default 1e-05): A value added to the denominator for
numerical stability. Default is 1e-5. numerical stability. Default is 1e-5.
param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn
will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
If the Initializer of the param_attr is not set, the parameter is initialized If the Initializer of the param_attr is not set, the parameter is initialized
with Xavier. Default: None. with Xavier. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the bias of inplace_abn. bias_attr(ParamAttr|None): The parameter attribute for the bias of inplace_abn.
If it is set to None or one attribute of ParamAttr, inplace_abn If it is set to None or one attribute of ParamAttr, inplace_abn
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero. If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None. Default: None.
data_layout (str, optional): Specify the data format of the input, and the data format of the output data_layout (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`. `[batch_size, input_channels, input_height, input_width]`.
name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. name(str|None): For detailed information, please refer to :ref:`api_guide_Name`.
Usually name is no need to set and None by default. Usually name is no need to set and None by default.
moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it
is set to None, inplace_abn will save global mean with a random name, otherwise, inplace_abn is set to None, inplace_abn will save global mean with a random name, otherwise, inplace_abn
will save global mean with the string. will save global mean with the string.
moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance. moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance.
If it is set to None, inplace_abn, will save global variance with a random name, otherwise, inplace_abn If it is set to None, inplace_abn, will save global variance with a random name, otherwise, inplace_abn
will save global variance with the string. will save global variance with the string.
do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model
average when model average is enabled. average when model average is enabled.
...@@ -2904,8 +2904,8 @@ def inplace_abn(input, ...@@ -2904,8 +2904,8 @@ def inplace_abn(input,
inplace activative batch normalization will be used, and alpha parameter for activation inplace activative batch normalization will be used, and alpha parameter for activation
can be given by this parameter. can be given by this parameter.
Returns: Returns:
A Variable holding Tensor which is the result after applying batch normalization and activation on the input, A Variable holding Tensor which is the result after applying batch normalization and activation on the input,
has same shape and data type with input. has same shape and data type with input.
Examples: Examples:
...@@ -3042,7 +3042,7 @@ def instance_norm(input, ...@@ -3042,7 +3042,7 @@ def instance_norm(input,
DataLayout: NCHW `[batch, in_channels, in_height, in_width]` DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
Refer to `Instance Normalization: The Missing Ingredient for Refer to `Instance Normalization: The Missing Ingredient for
Fast Stylization <https://arxiv.org/pdf/1607.08022.pdf>`_ Fast Stylization <https://arxiv.org/pdf/1607.08022.pdf>`_
for more details. for more details.
...@@ -3062,26 +3062,26 @@ def instance_norm(input, ...@@ -3062,26 +3062,26 @@ def instance_norm(input,
`H` means height of feature map, `W` means width of feature map. `H` means height of feature map, `W` means width of feature map.
Args: Args:
input(variable): The rank of input variable can be 2, 3, 4, 5. input(variable): The rank of input variable can be 2, 3, 4, 5.
The data type is float32 or float64. The data type is float32 or float64.
epsilon(float, Default 1e-05): A value added to the denominator for epsilon(float, Default 1e-05): A value added to the denominator for
numerical stability. Default is 1e-5. numerical stability. Default is 1e-5.
param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
If the Initializer of the param_attr is not set, the parameter is initialized If the Initializer of the param_attr is not set, the parameter is initialized
with Xavier. Default: None. with Xavier. Default: None.
bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm. bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm.
If it is set to None or one attribute of ParamAttr, instance_norm If it is set to None or one attribute of ParamAttr, instance_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero. If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None. Default: None.
name(string, Default None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
A Variable holding a Tensor which is the result of applying instance normalization on the input,
with the same shape and data type as the input.
Examples:
...@@ -3185,7 +3185,7 @@ def data_norm(input,
act(string, Default None): Activation type, linear|relu|prelu|...
epsilon(float, Default 1e-05): A small value added to the variance to avoid division by zero.
param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
data_layout (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
...@@ -3196,11 +3196,11 @@ def data_norm(input,
moving_variance_name(string, Default None): The name of the moving_variance which stores the global Variance.
do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance
should do model average when model average is enabled.
slot_dim(int): The embedding dimension of one slot. Slot is a set of one specific feature. In pslib mode, we
distinguish feature ids by slot and pull their embeddings from parameter server (pslib). The first
place of the embedding is the historical show number (occurrence time of this feature id with a label 0).
If the input of this op is concatenated from slot-wise embeddings, and the show number is zero when this slot
is new or empty, the normalization result may be impractical. To avoid this, we add slot_dim to locate
the show number and judge if the show number is zero. If so, we choose to skip normalization on this
embedding.
sync_stats(bool, Default False): When running with multiple GPU cards, using allreduce to sync the
...@@ -3213,7 +3213,7 @@ def data_norm(input,
Examples:
.. code-block:: python
import paddle.fluid as fluid
hidden1 = fluid.data(name="hidden1", shape=[64, 200])
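# The remainder of this example is collapsed in the diff view; a minimal
# sketch of the assumed continuation (illustrative only):
hidden2 = fluid.layers.data_norm(name="hidden2", input=hidden1)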
...@@ -3452,7 +3452,7 @@ def group_norm(input,
Default: None, the default bias parameter attribute is used. For more information, please
refer to :ref:`api_guide_ParamAttr` .
act(str, optional): Activation to be applied to the output of group normalization.
data_layout(str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
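For reference, a minimal usage sketch of group_norm consistent with this hunk (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
# NCHW input; 8 channels normalized in 4 groups of 2 channels each
data = fluid.data(name='data', shape=[None, 8, 32, 32], dtype='float32')
x = fluid.layers.group_norm(input=data, groups=4)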
...@@ -3544,7 +3544,7 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
calculations with U and V for :attr:`power_iters` rounds. Calculations
as follows:
.. math::
\mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
...@@ -3558,7 +3558,7 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
\sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
\mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})}
Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
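A minimal usage sketch under the signature shown above (the weight shape is illustrative):
.. code-block:: python
import paddle.fluid as fluid
weight = fluid.data(name='weight', shape=[2, 8, 32, 32], dtype='float32')
# normalize along dim 1, running two power iterations
x = fluid.layers.spectral_norm(weight=weight, dim=1, power_iters=2)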
...@@ -3692,13 +3692,13 @@ def conv2d_transpose(input,
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ]
Note:
The conv2d_transpose can be seen as the backward of the conv2d. For conv2d,
when stride > 1, conv2d maps multiple input shapes to the same output shape,
so for conv2d_transpose, when stride > 1, one input shape maps to multiple output shapes.
If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`;
else, the :math:`H_{out}` of the output size must be between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must be
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`,
conv2d_transpose can compute the kernel size automatically.
Args:
...@@ -3710,15 +3710,15 @@ def conv2d_transpose(input,
tuple, it must contain two integers, (image_height, image_width). None if use
filter_size, padding, and stride to calculate output_size.
If output_size and filter_size are specified at the same time, they
should follow the formula above. Default: None. output_size and filter_size
should not be None at the same time.
filter_size(int|tuple, optional): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_height, filter_size_width).
Otherwise, filter_size_height = filter_size_width = filter_size. None if
use output size to calculate filter_size. Default: None. filter_size and
output_size should not be None at the same time.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
`dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
...@@ -3731,12 +3731,12 @@ def conv2d_transpose(input,
when `data_format` is `'NHWC'`, `padding` can be in the form
`[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width).
Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
...@@ -3757,27 +3757,27 @@ def conv2d_transpose(input,
library is installed. Default: True.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name need not be set and is
None by default.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
A Variable holding a Tensor representing the conv2d_transpose, whose
data type is the same as the input and whose shape is (num_batches, channels, out_h,
out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor variable
stores the transposed convolution result; if act is not None, it stores
the result of transposed convolution followed by the non-linear activation.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCHW" or "NHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not a 4-D Tensor.
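For reference, a minimal usage sketch consistent with the documented signature (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2,
                                                 filter_size=3)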
...@@ -3982,32 +3982,32 @@ def conv3d_transpose(input,
W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ]
Note:
The conv3d_transpose can be seen as the backward of the conv3d. For conv3d,
when stride > 1, conv3d maps multiple input shapes to the same output shape,
so for conv3d_transpose, when stride > 1, one input shape maps to multiple output shapes.
If output_size is None, :math:`D_{out} = D^\prime_{out}, H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`;
else, the :math:`D_{out}` of the output
size must be between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`,
the :math:`H_{out}` of the output size must be between :math:`H^\prime_{out}`
and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must be
between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`,
conv3d_transpose can compute the kernel size automatically.
Args:
input(Variable): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type
of input is float32 or float64.
num_filters(int): The number of the filter. It is the same as the output
image channel.
output_size(int|tuple, optional): The output image size. If output size is a
tuple, it must contain three integers, (image_depth, image_height, image_width). This
parameter only works when filter_size is None. If output_size and filter_size are
specified at the same time, they should follow the formula above. Default: None.
output_size and filter_size should not be None at the same time.
filter_size(int|tuple, optional): The filter size. If filter_size is a tuple,
it must contain three integers, (filter_size_depth, filter_size_height,
filter_size_width). Otherwise, filter_size_depth = filter_size_height = \
filter_size_width = filter_size. None if use output size to
calculate filter_size. Default: None. filter_size and output_size should not be
None at the same time.
padding(int|list|str|tuple, optional): The padding size. The padding argument effectively
adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string,
...@@ -4019,13 +4019,13 @@ def conv3d_transpose(input,
when `data_format` is `'NDHWC'`, `padding` can be in the form
`[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
Default: padding = 0.
stride(int|tuple, optional): The stride size. It means the stride in transposed convolution.
If stride is a tuple, it must contain three integers, (stride_depth, stride_height,
stride_width). Otherwise, stride_depth = stride_height = stride_width = stride.
Default: stride = 1.
dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points.
If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation.
Default: dilation = 1.
groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by
grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
...@@ -4046,26 +4046,26 @@ def conv3d_transpose(input,
library is installed. Default: True
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name need not be set and is
None by default.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
Returns:
A Variable holding a Tensor representing the conv3d_transpose, whose data
type is the same as the input and whose shape is (num_batches, channels, out_d, out_h,
out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor
variable stores the transposed convolution result; if act is not None, the tensor
variable stores the result of transposed convolution followed by the non-linear activation.
Raises:
ValueError: If the type of `use_cudnn` is not bool.
ValueError: If `data_format` is not "NCDHW" or "NDHWC".
ValueError: If `padding` is a string, but not "SAME" or "VALID".
ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0
or the element corresponding to the input's channel is not 0.
ValueError: If `output_size` and filter_size are None at the same time.
ShapeError: If the input is not a 5-D Tensor.
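For reference, a minimal usage sketch consistent with the documented signature (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32')
conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2,
                                                 filter_size=3)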
...@@ -4240,7 +4240,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
Raises:
TypeError, if out data type is different from the input data type.
Examples:
.. code-block:: python
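# The example body is collapsed in this diff view; a minimal sketch of
# reduce_sum usage under the documented dim/keep_dim semantics:
import paddle.fluid as fluid
x = fluid.data(name='x', shape=[2, 4], dtype='float32')
out0 = fluid.layers.reduce_sum(x)                         # sum over all dims
out1 = fluid.layers.reduce_sum(x, dim=0)                  # shape: [4]
out2 = fluid.layers.reduce_sum(x, dim=-1, keep_dim=True)  # shape: [2, 1]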
...@@ -4304,18 +4304,18 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
:math:`rank(input) + dim[i]`.
keep_dim (bool, optional): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true, default
value is False.
name(str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: Tensor, results of average on the specified dim of input tensor,
its data type is the same as the input Tensor's.
Raises:
TypeError, if out data type is different from the input data type.
Examples:
.. code-block:: python
...@@ -4380,7 +4380,7 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true, default
value is False.
name(str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
...@@ -4441,7 +4441,7 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true, default
value is False.
name(str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
...@@ -4502,13 +4502,13 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true, default
value is False.
name(str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: Tensor, result of product on the specified dim of input tensor,
its data type is the same as the input Tensor's.
Examples:
.. code-block:: python
...@@ -4558,19 +4558,19 @@ def reduce_all(input, dim=None, keep_dim=False, name=None):
If :attr:`None`, compute the logical and over all elements of
:attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. The default value is None.
keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. The default value is False.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. The default value is None.
Returns:
Variable: The reduced tensor variable with ``logical and`` in given dims; the output data type is bool.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
...@@ -4581,7 +4581,7 @@ def reduce_all(input, dim=None, keep_dim=False, name=None):
x = layers.assign(np.array([[1, 0], [1, 1]], dtype='int32'))
x = layers.cast(x, 'bool')
out = layers.reduce_all(x)  # False
out = layers.reduce_all(x, dim=0)  # [True, False]
out = layers.reduce_all(x, dim=-1)  # [False, True]
# keep_dim=False, x.shape=(2,2), out.shape=(2,)
...@@ -4616,13 +4616,13 @@ def reduce_any(input, dim=None, keep_dim=False, name=None):
If :attr:`None`, compute the logical or over all elements of
:attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. The default value is None.
keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. The default value is False.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. The default value is None.
Returns:
Variable: The reduced tensor variable with ``logical or`` in given dims; the output data type is bool.
Examples:
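The example body is collapsed here; a minimal sketch mirroring the reduce_all example above:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
x = layers.assign(np.array([[1, 0], [1, 1]], dtype='int32'))
x = layers.cast(x, 'bool')
out = layers.reduce_any(x)  # True
out = layers.reduce_any(x, dim=0)  # [True, True]
out = layers.reduce_any(x, dim=-1)  # [True, True]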
...@@ -4832,14 +4832,14 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
epsilon(float): The epsilon value is used to avoid division by zero, \
the default value is 1e-12.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: The output has the same shape and data type as `x`.
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
...@@ -4848,18 +4848,18 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,3).astype("float32")
print(input_data)
# [[0.5171216 0.12704141 0.56018186]
# [0.93251234 0.5382788 0.81709313]]
output_data = exe.run(fluid.default_main_program(),
                      feed={"input":input_data},
                      fetch_list=[output],
                      return_numpy=True)
print(output_data)
# [array([[0.48496857, 0.22970329, 0.56545246],
...@@ -4872,10 +4872,10 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
input = dg.to_variable(input_data)
output = fluid.layers.l2_normalize(x=input, axis=-1)
print(output.numpy())
# [[0.66907585 0.16437206 0.7247892 ]
# [0.6899054 0.3982376 0.6045142 ]]
"""
if len(x.shape) == 1:
...@@ -5080,7 +5080,7 @@ def ctc_greedy_decoder(input,
blanks and delete all blanks.
This op is implemented in two modes: lod and padding, either of which can be used.
The input can be either LoDTensor or Tensor, corresponding to lod and padding
mode respectively.
A simple example is shown below:
...@@ -5134,7 +5134,7 @@ def ctc_greedy_decoder(input,
step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
[[0], [2], [1], [0]], for input.data[4:8] is [[0], [3], [3], [0]], shape is [2,4,1]
step2: Change the argmax result to use padding mode, then argmax result is
[[0, 2, 1, 0], [0, 3, 3, 0]], shape is [2, 4], lod is [], input_length is [[4], [4]]
step3: Apply ctc_align to padding argmax result, padding_value is 0
...@@ -5146,8 +5146,8 @@ def ctc_greedy_decoder(input,
Parameters:
input(Variable): the probabilities of variable-length sequences. When in lod mode,
it is a 2-D LoDTensor with LoD information. Its shape is [Lp, num_classes + 1]
where Lp is the sum of all input sequences' length and
num_classes is the true number of classes. When in padding mode,
it is a 3-D Tensor with padding, its shape is [batch_size, N, num_classes + 1].
...@@ -5158,9 +5158,9 @@ def ctc_greedy_decoder(input,
input_length(Variable, optional): 2-D LoDTensor, shape is [batch_size, 1], data type is int64.
It is used for padding mode. In lod mode, input_length is None.
padding_value(int): padding value.
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
For lod mode, returns the result of CTC greedy decoder, 2-D LoDTensor, shape is [Lp, 1], \
...@@ -5168,7 +5168,7 @@ def ctc_greedy_decoder(input,
in result were empty, the result LoDTensor will be [-1] with empty \
LoD [[]].
For padding mode, returns a tuple of (output, output_length), as described below:
output, 2-D Tensor, shape is [batch_size, N], data type is int64.
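A minimal padding-mode usage sketch consistent with the parameters above (shapes are illustrative):
.. code-block:: python
import paddle.fluid as fluid
# 3-D probability input plus per-sequence lengths for padding mode
x_pad = fluid.data(name='x_pad', shape=[10, 4, 8], dtype='float32')
x_pad_len = fluid.data(name='x_pad_len', shape=[10, 1], dtype='int64')
out, out_len = fluid.layers.ctc_greedy_decoder(input=x_pad, blank=0,
                                               input_length=x_pad_len)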
...@@ -5353,7 +5353,7 @@ def im2sequence(input,
paddings of four directions. Or it can contain two integers :math:`[padding\_height, padding\_width]` which means
padding_up = padding_down = padding_height and
padding_left = padding_right = padding_width. Otherwise, a scalar padding means
padding_up = padding_down = padding_left = padding_right = padding.
Default is 0.
input_image_size(Variable, optional): the input contains the real size of the image. Its dim
...@@ -5366,9 +5366,9 @@ def im2sequence(input,
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name` .
Returns:
The output is a 2-D LoDTensor with shape {input.batch\_size * output\_height * output\_width, \
filter\_size\_height * filter\_size\_width * input.channels}. The data type is float32.
Return Type: Variable
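A minimal usage sketch of im2sequence (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
output = fluid.layers.im2sequence(
    input=data, stride=[1, 1], filter_size=[2, 2])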
...@@ -5617,7 +5617,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
output= exe.run(feed={"x":x, "y":y},
fetch_list=[result])
print(output)
#[array([[0.08220536],
# [0.36652038],
# [0.20541131]], dtype=float32)]
...@@ -5681,7 +5681,7 @@ def one_hot(input, depth, allow_out_of_range=False):
output:
Out.shape = [4, 4]
Out.data = [[0., 1., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 0., 0.], # This id is 5, which goes beyond depth, so set it all-zeros data.
[1., 0., 0., 0.]]
...@@ -5694,7 +5694,7 @@ def one_hot(input, depth, allow_out_of_range=False):
allow_out_of_range = False
output: Throws an exception for the illegal value
The second dimension in X is 5, which is greater than depth.
allow_out_of_range = False means the word id is not allowed to exceed depth,
so an exception is thrown.
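A minimal usage sketch matching the behavior described above (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
# label is a column vector of word ids; depth 4 means 4 classes
label = fluid.data(name="label", shape=[4, 1], dtype="int64")
one_hot_label = fluid.layers.one_hot(input=label, depth=4)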
...@@ -5702,7 +5702,7 @@ def one_hot(input, depth, allow_out_of_range=False):
input(Variable): Tensor or LoDTensor with shape :math:`[N_1, N_2, ..., N_k, 1]` ,
which contains at least one dimension and the last dimension must be 1.
The data type is int32 or int64.
depth(scalar): An integer defining the :attr:`depth` of the one hot dimension. If input
is word id, depth is generally the dictionary size.
allow_out_of_range(bool): A bool value indicating whether the input
indices could be out of range :math:`[0, depth)` . When input indices are
...@@ -5754,8 +5754,8 @@ def one_hot(input, depth, allow_out_of_range=False):
def autoincreased_step_counter(counter_name=None, begin=1, step=1):
"""
Create an auto-increasing variable, which will be automatically increased
by 1 in every iteration. By default, the first return of this counter is 1,
and the step size is 1.
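A minimal usage sketch (the counter name follows the convention used by fluid's learning-rate schedulers; it is illustrative here):
.. code-block:: python
import paddle.fluid as fluid
global_step = fluid.layers.autoincreased_step_counter(
    counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)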
Args:
...@@ -5995,7 +5995,7 @@ def squeeze(input, axes, name=None):
to one will be deleted.
.. code-block:: text
Case1:
...@@ -6314,20 +6314,20 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None,
Args:
input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W] or [N, H, W, C],
where N is the batch size, C is the input channel, H is height, W is width. The data
type is float32. The rank of this tensor must be 4, otherwise it will raise ValueError.
n (int, optional): The number of channels to sum over. Default: 5
k (float, optional): An offset, positive. Default: 1.0
alpha (float, optional): The scaling parameter, positive. Default:1e-4
beta (float, optional): The exponent, positive. Default:0.75
name (str, optional): The default value is None. Normally there is no need for user to set
this property. For more information, please refer to :ref:`api_guide_Name`
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
Returns:
Variable: A tensor variable storing the transformation result with the same shape and data type as input.
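A minimal usage sketch of lrn with default parameters (tensor names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name="data", shape=[None, 3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data)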
...@@ -6408,11 +6408,11 @@ def pad(x, paddings, pad_value=0., name=None):
x (Variable): Tensor, data type is float32.
paddings (list): A list of integers. Its elements specify the padded
width before and after each dimension in turn.
The length of :attr:`paddings` must be equal to
:math:`rank(x) \\times 2`.
pad_value (float): The constant value used to pad.
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
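A minimal usage sketch showing the paddings layout described above (names are illustrative):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='data', shape=[300, 300], dtype='float32')
# [0, 1, 1, 2]: 0 rows before / 1 row after dim 0, 1 col before / 2 after dim 1
out = fluid.layers.pad(x=x, paddings=[0, 1, 1, 2], pad_value=0.)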
Returns:
...@@ -6484,11 +6484,11 @@ def pad_constant_like(x, y, pad_value=0., name=None):
[ 0, 0, 0]],
[[41, 42, 43],
[ 0, 0, 0]]],
[[[ 0, 0, 0],
[ 0, 0, 0]],
[[ 0, 0, 0],
[ 0, 0, 0]],
[[ 0, 0, 0],
[ 0, 0, 0]]]]
Out.shape = [2, 3, 2, 3]
...@@ -6496,11 +6496,11 @@ def pad_constant_like(x, y, pad_value=0., name=None):
Args:
x (Variable): Tensor, its shape specifies the shape of output.
y (Variable): Tensor, its rank is the same as :attr:`x`, and for each dimension :math:`i` ,
:math:`y\_shape[i] <= x\_shape[i]` . The data type can be float32 or float64.
pad_value (float): The constant value used to pad.
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
...@@ -6538,8 +6538,8 @@ def label_smooth(label,
dtype="float32",
name=None):
"""
Label smoothing is a mechanism to regularize the classifier layer and is called
label-smoothing regularization (LSR).
Label smoothing is proposed to encourage the model to be less confident,
since optimizing the log-likelihood of the correct label directly may
...@@ -6560,20 +6560,20 @@ def label_smooth(label,
Parameters:
label(Variable): The input variable containing the label data. The
label data should use one-hot representation. It's
a multidimensional tensor with a shape of
:math:`[N_1, ..., Depth]`, where Depth is class number.
prior_dist(Variable, optional): The prior distribution to be used to smooth
labels. If not provided, a uniform distribution
is used. It's a multidimensional tensor with a shape of
:math:`[1, class\_num]` . The default value is None.
epsilon(float, optional): The weight used to mix up the original ground-truth
distribution and the fixed distribution. The default value is
0.1.
dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type can be set
as 'float32', 'float64'. The default value is 'float32'.
name(str, optional): The default value is None. Normally there is no need for user
to set this property. For more information, please refer to
:ref:`api_guide_Name`.
Returns:
...@@ -6581,7 +6581,7 @@ def label_smooth(label,
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
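# The remainder of this example is collapsed in the diff view; a minimal
# sketch of the assumed continuation (shapes and depth are illustrative):
label = layers.data(name="label", shape=[1], dtype="int32")
one_hot_label = layers.one_hot(input=label, depth=10)
smooth_label = layers.label_smooth(
    label=one_hot_label, epsilon=0.1, dtype="float32")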
...@@ -6617,17 +6617,17 @@ def roi_pool(input,
spatial_scale=1.0,
rois_lod=None):
"""
This operator implements the roi_pooling layer.
Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7).
The operator has three steps:
1. Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height;
2. Finding the largest value in each section;
3. Copying these max values to the output buffer.
For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
Args:
input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is height, W is width. The data type is float32 or float64.
rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates.
...@@ -6635,30 +6635,30 @@ def roi_pool(input,
pooled_height (int, optional): The pooled output height, data type is int32. Default: 1
pooled_width (int, optional): The pooled output width, data type is int32. Default: 1
spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0
Returns:
Variable: The pooled feature, 4D-Tensor with the shape of [num_rois, C, pooled_height, pooled_width].
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
DATATYPE='float32'
place = fluid.CPUPlace()
#place = fluid.CUDAPlace(0)
input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE)
roi_data = fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place)
rois_lod_data = np.array([0, 2])
x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE)
rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE)
rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
pool_out = fluid.layers.roi_pool(
input=x,
...@@ -6667,7 +6667,7 @@ def roi_pool(input,
pooled_width=1,
spatial_scale=1.0,
rois_lod=rois_lod)
exe = fluid.Executor(place)
out, = exe.run(feed={'input': input_data, 'roi': roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name])
print(out)  # array([[[[11.]]], [[[16.]]]], dtype=float32)
...@@ -6707,8 +6707,8 @@ def roi_align(input,
Args:
input (Variable): ${x_comment}
rois (Variable): ROIs (Regions of Interest) to pool over. It should be
a 2-D LoDTensor of shape (num_rois, 4), the lod level is 1. The
data type is float32 or float64. Given as [[x1, y1, x2, y2], ...],
(x1, y1) is the top left coordinates, and (x2, y2) is the bottom
right coordinates.
rois_lod (Variable): The lod info of rois. Default: None
...@@ -6716,9 +6716,9 @@ def roi_align(input,
pooled_height (int32, optional): ${pooled_height_comment} Default: 1
pooled_width (int32, optional): ${pooled_width_comment} Default: 1
spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0
sampling_ratio(int32, optional): ${sampling_ratio_comment} Default: -1
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name need not be set and is
None by default.
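A minimal usage sketch of roi_align (rois_lod is omitted here; tensor names and shapes are illustrative):
.. code-block:: python
import paddle.fluid as fluid
x = fluid.data(name='data', shape=[None, 256, 32, 32], dtype='float32')
rois = fluid.data(name='rois', shape=[None, 4], dtype='float32')
align_out = fluid.layers.roi_align(input=x,
                                   rois=rois,
                                   pooled_height=7,
                                   pooled_width=7,
                                   spatial_scale=0.5,
                                   sampling_ratio=-1)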
Returns:
Variable:
...@@ -6782,13 +6782,13 @@ def dice_loss(input, label, epsilon=0.00001, name=None):
input (Variable): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_D]`, where :math:`N_1` is
the batch_size, :math:`N_D` is 1. It is usually the output predictions of sigmoid activation.
The data type can be float32 or float64.
label (Variable): Tensor, the ground truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_D]`,
where :math:`N_1` is the batch_size, :math:`N_D` is 1. The data type can be float32 or float64.
epsilon (float): The epsilon will be added to the numerator and denominator.
If both input and label are empty, it makes sure dice is 1.
Default: 0.00001
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns: Returns:
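A minimal NumPy sketch of the dice loss documented above (the helper name `dice_loss_np` is ours, not part of the API); with epsilon in both numerator and denominator, two all-zero tensors give dice = 1 and loss = 0:

.. code-block:: python

    import numpy as np

    def dice_loss_np(pred, label, epsilon=0.00001):
        # dice = 2*|pred*label| / (|pred| + |label|), smoothed by epsilon
        inter = 2.0 * np.sum(pred * label)
        union = np.sum(pred) + np.sum(label)
        return 1.0 - (inter + epsilon) / (union + epsilon)

    pred = np.array([[0.9], [0.1], [0.8]], dtype="float32")
    label = np.array([[1.], [0.], [1.]], dtype="float32")
    print(dice_loss_np(pred, label))           # ~0.105: good overlap, small loss
    print(dice_loss_np(label * 0, label * 0))  # 0.0: the empty case gives dice 1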
...@@ -6827,9 +6827,9 @@ def image_resize(input,
"""
This op resizes a batch of images.
The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w)
or (num_batches, in_h, in_w, channels), or a 5-D Tensor of the shape
(num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels),
and the resizing only applies to the three dimensions (depth, height and width).
**Warning:** the parameter :attr:`actual_shape` will be deprecated in the
...@@ -6844,21 +6844,21 @@ def image_resize(input,
'NEAREST' : Nearest neighbor interpolation
Nearest neighbor interpolation is performed in both the 3rd dimension
(height direction) and the 4th dimension (width direction) of the
input tensor.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
Trilinear interpolation is an extension of linear interpolation for
interpolating functions of three variables (e.g. D-direction,
H-direction and W-direction in this op) on a rectilinear 3D grid.
The linear interpolation is performed in three directions.
Align_corners and align_mode are optional parameters; the calculation method
of the interpolation can be selected by them.
Example:
...@@ -6866,18 +6866,18 @@ def image_resize(input,
.. code-block:: text
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
...@@ -6900,15 +6900,15 @@ def image_resize(input,
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
...@@ -6919,31 +6919,31 @@ def image_resize(input,
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation.
For details of trilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Trilinear_interpolation.
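The "For scale" rule at the top of the example block can be checked directly. A minimal sketch of the two `scale_factor` branches (the helper name is ours, not part of the API):

.. code-block:: python

    def scale_factor(in_size, out_size, align_corners):
        # mirrors the "For scale" rule in the docstring above
        if align_corners and out_size > 1:
            return (in_size - 1.0) / (out_size - 1.0)
        return float(in_size) / out_size

    in_h, out_h = 6, 12
    print(scale_factor(in_h, out_h, True))   # 0.4545... = 5/11
    print(scale_factor(in_h, out_h, False))  # 0.5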
...@@ -6953,7 +6953,7 @@ def image_resize(input,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of the image resize
layer, the shape is (out_h, out_w) when input is a 4-D Tensor and is
(out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If
a list, each element can be an integer or a Tensor Variable of shape [1].
If a Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input height or width. At
...@@ -6970,24 +6970,24 @@ def image_resize(input,
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
:attr:`out_shape` if you want to specify the output
shape dynamically, because :attr:`actual_shape`
will be deprecated. When using actual_shape to
specify the output shape, one of :attr:`out_shape`
and :attr:`scale` should also be set, otherwise
errors would occur in the graph constructing stage.
Default: None
align_corners(bool): An optional bool. If True, the centers of the 4 corner pixels of the
input and output tensors are aligned, preserving the values at the
corner pixels.
Default: True
align_mode(int): An option for bilinear interpolation. It can be \'0\'
for src_idx = scale*(dst_indx+0.5)-0.5, or \'1\' for
src_idx = scale*dst_index.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`, `"NCDHW"`,
`"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`. When it is `"NCDHW"`, the data is stored
in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
Returns:
...@@ -7011,7 +7011,7 @@ def image_resize(input,
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
...@@ -7041,14 +7041,14 @@ def image_resize(input,
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,3,6,10).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data},
fetch_list=[output],
return_numpy=True)
print(output_data[0].shape)
#1
...@@ -7231,7 +7231,7 @@ def resize_bilinear(input,
output shape which is specified by actual_shape, out_shape and scale
in priority order.
**Warning:** the parameter :attr:`actual_shape` will be deprecated in
the future; use :attr:`out_shape` instead.
Bilinear interpolation is an extension of linear interpolation for
...@@ -7243,7 +7243,7 @@ def resize_bilinear(input,
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
Align_corners and align_mode are optional parameters; the calculation
method of the interpolation can be selected by them.
Example:
...@@ -7251,23 +7251,23 @@ def resize_bilinear(input,
.. code-block:: text
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
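The two align_mode branches differ only in how a destination index maps back to a source coordinate. A minimal sketch of that mapping (the helper name is ours):

.. code-block:: python

    def src_index(dst_idx, scale, align_mode):
        # align_mode 0: src_idx = scale*(dst_idx+0.5)-0.5
        # align_mode 1: src_idx = scale*dst_idx
        if align_mode == 0:
            return scale * (dst_idx + 0.5) - 0.5
        return scale * dst_idx

    scale = 0.5  # e.g. H_in=6 upsampled to H_out=12 with align_corners=False
    print([src_index(i, scale, 0) for i in range(4)])  # [-0.25, 0.25, 0.75, 1.25]
    print([src_index(i, scale, 1) for i in range(4)])  # [0.0, 0.5, 1.0, 1.5]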
...@@ -7282,12 +7282,12 @@ def resize_bilinear(input,
input(Variable): 4-D Tensor(NCHW), its data type is float32, float64, or uint8,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): Output shape of the resize bilinear
layer, the shape is (out_h, out_w). Default: None. If a list, each
element can be an integer or a Tensor Variable with shape [1]. If a
Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input height or width. At
least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
...@@ -7295,16 +7295,16 @@ def resize_bilinear(input,
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
:attr:`out_shape` if you want to specify the output
shape dynamically, because :attr:`actual_shape`
will be deprecated. When using actual_shape to
specify the output shape, one of :attr:`out_shape`
and :attr:`scale` should also be set, otherwise
errors would occur in the graph constructing stage.
Default: None
align_corners(bool): ${align_corners_comment}
align_mode(bool): ${align_mode_comment}
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
...@@ -7312,10 +7312,10 @@ def resize_bilinear(input,
Returns:
Variable: 4-D tensor(NCHW or NHWC).
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
...@@ -7345,14 +7345,14 @@ def resize_bilinear(input,
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,3,6,10).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data},
fetch_list=[output],
return_numpy=True)
print(output_data[0].shape)
#1
...@@ -7394,18 +7394,18 @@ def resize_trilinear(input,
output shape which is specified by actual_shape, out_shape and scale
in priority order.
**Warning:** the parameter :attr:`actual_shape` will be deprecated
in the future; use :attr:`out_shape` instead.
Trilinear interpolation is an extension of linear interpolation for
interpolating functions of three variables (e.g. D-direction,
H-direction and W-direction in this op) on a rectilinear 3D grid.
The linear interpolation is performed in three directions.
For details of trilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Trilinear_interpolation
Align_corners and align_mode are optional parameters; the calculation
method of the interpolation can be selected by them.
Example:
...@@ -7413,24 +7413,24 @@ def resize_trilinear(input,
.. code-block:: text
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
...@@ -7449,8 +7449,8 @@ def resize_trilinear(input,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): The output shape of the resized tensor, the shape is (out_d, out_h, out_w). Default: None. Every element should be an integer or a Tensor Variable with shape [1] if it is a list. If it is a Tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input depth, height or width.
At least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
actual_shape(Variable): An optional input to specify output shape
...@@ -7459,26 +7459,26 @@ def resize_trilinear(input,
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
:attr:`out_shape` if you want to specify the output
shape dynamically, because :attr:`actual_shape`
will be deprecated. When using actual_shape to
specify the output shape, one of :attr:`out_shape`
and :attr:`scale` should also be set, otherwise
errors would occur in the graph constructing stage.
Default: None
align_corners(bool): ${align_corners_comment}
align_mode(bool): ${align_mode_comment}
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`.
The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_depth, input_height, input_width]`.
Returns:
Variable: A 5-D Tensor(NCDHW or NDHWC)
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
...@@ -7508,14 +7508,14 @@ def resize_trilinear(input,
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,3,6,8,10).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data},
fetch_list=[output],
return_numpy=True)
print(output_data[0].shape)
#1
...@@ -7555,10 +7555,10 @@ def resize_nearest(input,
data_format='NCHW'):
"""
This op resizes the input by performing nearest neighbor interpolation in both the
height direction and the width direction based on the given output shape,
which is specified by actual_shape, out_shape and scale in priority order.
**Warning:** the parameter :attr:`actual_shape` will be deprecated in the
future; use :attr:`out_shape` instead.
Example:
...@@ -7566,16 +7566,16 @@ def resize_nearest(input,
.. code-block:: text
For scale:
if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Nearest neighbor interpolation:
if:
align_corners = False
...@@ -7603,9 +7603,9 @@ def resize_nearest(input,
its data format is specified by :attr:`data_format`.
out_shape(list|tuple|Variable|None): The output shape of the resized tensor, the shape is (out_h, out_w). Default: None. Every element should be an integer or a tensor Variable with shape [1] if it is a list. If it is a tensor Variable, its dimension size should be 1.
scale(float|Variable|None): The multiplier for the input height or width. At
least one of :attr:`out_shape` or :attr:`scale` must be set.
And :attr:`out_shape` has a higher priority than :attr:`scale`.
Default: None.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
...@@ -7613,15 +7613,15 @@ def resize_nearest(input,
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
:attr:`out_shape` if you want to specify the output
shape dynamically, because :attr:`actual_shape`
will be deprecated. When using actual_shape to
specify the output shape, one of :attr:`out_shape`
and :attr:`scale` should also be set, otherwise
errors would occur in the graph constructing stage.
Default: None
align_corners(bool): ${align_corners_comment}
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
...@@ -7631,7 +7631,7 @@ def resize_nearest(input,
Examples:
.. code-block:: python
# declarative mode
import paddle.fluid as fluid
import numpy as np
...@@ -7661,14 +7661,14 @@ def resize_nearest(input,
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,3,6,10).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data},
fetch_list=[output],
return_numpy=True)
print(output_data[0].shape)
#1
...@@ -7772,15 +7772,15 @@ def gather(input, index, overwrite=True):
[5, 6]]
Args:
input (Variable): The source input tensor with rank>=1. Supported data types are
int32, int64, float32, float64 and uint8 (only for CPU),
float16 (only for GPU).
index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
overwrite (bool, optional): The mode for updating the grad when the same index appears more than once.
If True, use the overwrite mode to update the grad of the same index;
if False, use the accumulate mode to update the grad of the same index.
Default value is True.
Returns:
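Forward gather is plain row indexing; `overwrite` only controls how gradients for a repeated index are combined in backward. A minimal NumPy sketch of the forward semantics, matching the output rows shown above:

.. code-block:: python

    import numpy as np

    x = np.array([[1, 2], [3, 4], [5, 6]])
    index = np.array([1, 2])
    print(x[index])  # [[3 4]
                     #  [5 6]]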
...@@ -7811,10 +7811,10 @@ def gather_nd(input, index, name=None):
"""
**Gather Nd Layer**
This function is actually a high-dimensional extension of :code:`gather`
and supports simultaneous indexing by multiple axes. :attr:`index` is a
K-dimensional integer tensor, which is regarded as a (K-1)-dimensional
tensor of indices into :attr:`input`, where each element defines
a slice of params:
.. math::
...@@ -7837,9 +7837,9 @@ def gather_nd(input, index, name=None):
* Case 1:
index = [[1]]
gather_nd(input, index)
= [input[1, :, :]]
= [[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]
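Since the last dimension of :attr:`index` indexes the leading axes of :attr:`input`, Case 1 can be reproduced with tuple indexing in NumPy. A minimal sketch:

.. code-block:: python

    import numpy as np

    x = np.arange(24).reshape(2, 3, 4)  # the input used in the cases above
    index = np.array([[1]])             # Case 1
    # each row of index selects a slice of x along its leading axes
    out = np.array([x[tuple(row)] for row in index])
    print(out)  # [[[12 13 14 15]
                #   [16 17 18 19]
                #   [20 21 22 23]]]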
...@@ -7897,7 +7897,7 @@ def scatter(input, index, updates, name=None, overwrite=True):
.. code-block:: python
import numpy as np
#input:
input = np.array([[1, 1], [2, 2], [3, 3]])
index = np.array([2, 1, 0, 1])
...@@ -7927,7 +7927,7 @@ def scatter(input, index, updates, name=None, overwrite=True):
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
overwrite (bool): The mode for updating the output when the same index appears more than once.
If True, use the overwrite mode to update the output of the same index;
if False, use the accumulate mode to update the output of the same index.
Default value is True.
Returns:
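The overwrite/accumulate distinction can be sketched in NumPy: fancy assignment keeps only the last write to a repeated index, while `np.add.at` sums them. A minimal sketch reusing the `input` and `index` arrays above (the `updates` values are our own illustration, and zeroing the touched rows before accumulating is a simplified reading of accumulate mode):

.. code-block:: python

    import numpy as np

    x = np.array([[1., 1.], [2., 2.], [3., 3.]])
    index = np.array([2, 1, 0, 1])
    updates = np.arange(1., 9.).reshape(4, 2)  # illustrative values, not from the docstring

    out = x.copy()
    out[index] = updates            # overwrite=True: the last write to index 1 wins

    acc = x.copy()
    acc[np.unique(index)] = 0.      # simplified accumulate mode: reset touched rows,
    np.add.at(acc, index, updates)  # then sum all updates aimed at the same index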
...@@ -7977,11 +7977,11 @@ def scatter_nd_add(ref, index, updates, name=None):
**Scatter_nd_add Layer**
Output is obtained by applying sparse addition to a single value
or slice in a Variable.
:attr:`ref` is a Tensor with rank :math:`R`
and :attr:`index` is a Tensor with rank :math:`K` . Thus, :attr:`index`
has shape :math:`[i_0, i_1, ..., i_{K-2}, Q]` where :math:`Q \leq R` . :attr:`updates`
is a Tensor with rank :math:`K - 1 + R - Q` and its
shape is :math:`index.shape[:-1] + ref.shape[index.shape[-1]:]` .
...@@ -7990,7 +7990,7 @@ def scatter_nd_add(ref, index, updates, name=None):
which is obtained from the last dimension of :attr:`index` .
.. code-block:: text
Given:
* Case 1:
...@@ -7999,7 +7999,7 @@ def scatter_nd_add(ref, index, updates, name=None):
updates = [9, 10, 11, 12]
we get:
output = [0, 22, 12, 14, 4, 5]
* Case 2:
...@@ -8012,7 +8012,7 @@ def scatter_nd_add(ref, index, updates, name=None):
updates.shape = (2, 2, 2)
we get:
output = [[67, 19], [-16, -27]]
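Case 1 is reproducible with `np.add.at`, which also accumulates the two updates aimed at the same position. A minimal sketch (the `ref` and `index` values are reconstructed so that they yield exactly the Case 1 output shown above):

.. code-block:: python

    import numpy as np

    ref = np.array([0., 1., 2., 3., 4., 5.])
    index = np.array([[1], [2], [3], [1]])  # chosen to reproduce Case 1's output
    updates = np.array([9., 10., 11., 12.])

    out = ref.copy()
    np.add.at(out, index[:, 0], updates)  # index 1 is hit twice: 1 + 9 + 12 = 22
    print(out)  # [ 0. 22. 12. 14.  4.  5.]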
Args:
...@@ -8057,13 +8057,13 @@ def scatter_nd(index, updates, shape, name=None):
"""
**Scatter_nd Layer**
Output is obtained by scattering the :attr:`updates` into a new tensor according
to :attr:`index` . This op is similar to :code:`scatter_nd_add`, except that the
tensor of :attr:`shape` is zero-initialized. Correspondingly, :code:`scatter_nd(index, updates, shape)`
is equal to :code:`scatter_nd_add(fluid.layers.zeros(shape, updates.dtype), index, updates)` .
If :attr:`index` has repeated elements, then the corresponding updates are accumulated.
Because of numerical approximation issues, a different order of repeated elements
in :attr:`index` may cause different results. The specific calculation method can be
seen in :code:`scatter_nd_add` . This op is the inverse of the :code:`gather_nd` op.
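The stated equivalence is easy to mirror in NumPy: scatter into a zero tensor of the requested shape, accumulating repeated indices. A minimal sketch (the values are our own illustration):

.. code-block:: python

    import numpy as np

    index = np.array([[1], [2], [1]])
    updates = np.array([9., 10., 11.])
    shape = (6,)

    out = np.zeros(shape)                 # the zero-initialized tensor of `shape`
    np.add.at(out, index[:, 0], updates)  # repeated index 1 accumulates: 9 + 11
    print(out)  # [ 0. 20. 10.  0.  0.  0.]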
Args:
...@@ -8156,7 +8156,7 @@ def log(x, name=None):
Args:
x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64.
name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable: The natural log of the input LoDTensor or Tensor computed element-wise.
...@@ -8239,14 +8239,14 @@ def selu(x, scale=None, alpha=None, name=None):
Selu Operator.
The equation is:
.. math::
selu= \\lambda*
\\begin{cases}
x &\\quad \\text{ if } x>0 \\\\
\\alpha * e^x - \\alpha &\\quad \\text{ if } x<=0
\\end{cases}
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
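With the usual SELU defaults (scale ~ 1.0507, alpha ~ 1.6733; treat these constants as an assumption here) the piecewise equation above can be checked directly. A minimal NumPy sketch:

.. code-block:: python

    import numpy as np

    scale, alpha = 1.0507009873554805, 1.6732632423543772  # common SELU defaults
    x = np.array([-1.0, 0.0, 2.0])
    # selu = scale * (x if x > 0 else alpha*e^x - alpha)
    y = scale * np.where(x > 0, x, alpha * np.exp(x) - alpha)
    print(y)  # approx. [-1.1113  0.      2.1014]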
...@@ -8270,7 +8270,7 @@ def selu(x, scale=None, alpha=None, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
...@@ -8322,7 +8322,7 @@ def mean_iou(input, label, num_classes):
Its shape should be the same as input.
num_classes (int32): The possible number of labels.
Returns:
Three Variables.
- mean_iou(Variable) : A 1-D Tensor representing the mean intersection-over-union with shape [1]. \
...@@ -8330,8 +8330,8 @@ def mean_iou(input, label, num_classes):
- out_wrong(Variable) : A 1-D Tensor with shape [num_classes]. Data type is int32. \
The number of wrong predictions for each class.
- out_correct(Variable): A 1-D Tensor with shape [num_classes]. Data type is int32. The number of correct predictions for each class.
Examples:
.. code-block:: python
...@@ -8401,7 +8401,7 @@ def crop(x, shape=None, offsets=None, name=None):
x (Variable): Tensor, data type can be float32 or float64.
shape (Variable|list/tuple of integers): The output shape is specified
by `shape`, which can be a Tensor or a list/tuple of integers.
If it is a Tensor, its rank must be the same as `x`; only
its shape will be used, and its value will be ignored. This way
is suitable for the case that the output shape may be changed each
iteration. If it is a list/tuple of integers, its length must be the same
...@@ -8412,9 +8412,9 @@ def crop(x, shape=None, offsets=None, name=None):
This way is suitable for the case that the offsets may be changed
each iteration. If it is a list/tuple of integers, its length must be the
same as the rank of `x`. If None, the offsets are 0 at each dimension.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name` . Usually the name does not need to be set and
is None by default.
Returns:
The cropped Tensor, which has the same rank and data type as `x`
...@@ -8680,7 +8680,7 @@ def affine_grid(theta, out_shape, name=None):
name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: A Tensor with shape [batch_size, H, W, 2], where 'H' and 'W' are the height and width of the feature map in the affine transformation. The data type is the same as `theta`.
Raises:
ValueError: If the type of arguments is not supported.
...@@ -8835,7 +8835,7 @@ def elu(x, alpha=1.0, name=None):
Args:
x(${x_type}): ${x_comment}
alpha(${alpha_type}|1.0): ${alpha_comment}
name(str|None): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
Returns:
${out_type}: ${out_comment}
...@@ -8846,7 +8846,7 @@ def elu(x, alpha=1.0, name=None):
import paddle.fluid as fluid
import numpy as np
input_elu = np.array([[-1,6],[1,15.6]])
with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(input_elu)
...@@ -8970,7 +8970,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
will be named automatically.
Returns:
output(${out_type}): ${out_comment}.
Examples:
...@@ -9047,17 +9047,17 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
def swish(x, beta=1.0, name=None):
"""
Elementwise swish activation function. See `Searching for Activation Functions <https://arxiv.org/abs/1710.05941>`_ for more details.
Equation:
.. math::
out = \\frac{x}{1 + e^{- beta * x}}
Args:
x(Variable): Tensor or LoDTensor, dtype: float32 or float64, the input of swish activation.
beta(float): Constant beta of swish operator, default 1.0.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
...@@ -9067,23 +9067,23 @@ def swish(x, beta=1.0, name=None):
Examples:
.. code-block:: python
# declarative mode
import numpy as np
from paddle import fluid
x = fluid.data(name="x", shape=(-1, 3), dtype="float32")
y = fluid.layers.swish(x, beta=2.0)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
start = fluid.default_startup_program()
main = fluid.default_main_program()
data = np.random.randn(2, 3).astype("float32")
exe.run(start)
y_np, = exe.run(main, feed={"x": data}, fetch_list=[y])
data
# array([[-1.1239197 , 1.3391294 , 0.03921051],
# [ 1.1970421 , 0.02440812, 1.2055548 ]], dtype=float32)
...@@ -9098,7 +9098,7 @@ def swish(x, beta=1.0, name=None):
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
data = np.random.randn(2, 3).astype("float32")
place = fluid.CPUPlace()
with dg.guard(place) as g:
...@@ -9141,13 +9141,13 @@ def prelu(x, mode, param_attr=None, name=None):
Args:
x (Variable): The input Tensor or LoDTensor with data type float32.
mode (str): The mode for weight sharing.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha), which can be created by ParamAttr. None by default.
For detailed information, please refer to :ref:`api_fluid_ParamAttr`.
name(str|None): For detailed information, please refer
to :ref:`api_guide_Name`. Usually the name does not need to be set and
is None by default.
Returns:
Variable:
...@@ -9202,7 +9202,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
x(${x_type}): ${x_comment}
t_min(${t_min_type}|0.0): ${t_min_comment}
t_max(${t_max_type}|24.0): ${t_max_comment}
name(str|None): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
Returns:
${out_type}: ${out_comment}
...@@ -9213,14 +9213,14 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
import paddle.fluid as fluid
import numpy as np
input_brelu = np.array([[-1,6],[1,15.6]])
with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(input_brelu)
y = fluid.layers.brelu(x, t_min=1.0, t_max=10.0)
print(y.numpy())
#[[ 1. 6.]
#[ 1. 10.]]
"""
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu')
...@@ -9297,8 +9297,8 @@ def soft_relu(x, threshold=40.0, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
...@@ -9520,14 +9520,14 @@ def stack(x, axis=0):
def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0):
"""
**Filter By Instag Layer**
This function filters a batch of ins by instag.
There are multiple ins, and every ins belongs to some tags.
We can specify the tags we want, so the ins which belong to those tags
remain in the output while the others are removed.
For example, one batch has 4 ins. Every ins has its tag list.
| Ins | Ins_Tag |
|:-----:|:------:|
| 0 | 0, 1 |
...@@ -9543,7 +9543,7 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0):
So Ins 0 and Ins 1 can pass and be seen in the output;
Ins 2 and 3 cannot pass because they do not have tag 1.
Actually, if is_lod is false, ins is a normal tensor that equals a
lod_tensor with all lod values 1, similar to the example above.
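The tag filtering itself is easy to emulate: an ins is kept when the intersection of its tag list and filter_tag is non-empty. A minimal sketch over the 4-ins example above (only row 0 of the table survives in this excerpt; the tag lists for Ins 1-3 are illustrative, and we assume filter_tag holds tag 1 as the text implies):

.. code-block:: python

    ins_tags = {0: [0, 1], 1: [1, 3], 2: [0, 3], 3: [2, 6]}  # Ins -> Ins_Tag
    filter_tag = [1]

    kept = [i for i, tags in ins_tags.items() if set(tags) & set(filter_tag)]
    print(kept)  # [0, 1] -- Ins 2 and 3 are dropped: they do not have tag 1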
Args:
...@@ -9551,7 +9551,7 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0):
And its first dimension can carry lod info or not.
ins_tag (Variable): Input Variable (LoDTensor), usually a 1-D list
split by lod info.
filter_tag (Variable): Input Variable (1D Tensor/List), usually a
list that holds the tags.
is_lod (Bool): Boolean value to indicate whether ins is a lod tensor or not.
out_val_if_empty(Int64): If the output after filtering is empty, this value
...@@ -9568,7 +9568,7 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0):
ins_tag = layers.data(name='Ins_tag', shape=[-1,16], lod_level=0, dtype='int64')
filter_tag = layers.data(name='Filter_tag', shape=[-1,16], dtype='int64')
out, loss_weight = layers.filter_by_instag(ins, ins_tag, filter_tag, True)
"""
helper = LayerHelper('filter_by_instag', **locals())
@@ -9776,7 +9776,7 @@ def expand_as(x, target_tensor, name=None):
[[4], [5], [6]]
]
target_tensor's shape: [2, 6, 2]
Output(Out) is a 3-D tensor with shape [2, 6, 2]:
@@ -9784,7 +9784,7 @@ def expand_as(x, target_tensor, name=None):
[[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
[[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
]
Args:
x (Variable): A Tensor with dtype float64, float32, int32.
@@ -9793,22 +9793,22 @@ def expand_as(x, target_tensor, name=None):
target_tensor for expanding to Input(X). Only use target_tensor's shape.
Returns:
Variable: A Tensor with dtype float64, float32, int32.
After expanding, the size of each dimension of Output(Out) is equal to
the size of the corresponding dimension of target_tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
data = fluid.layers.data(name="data", shape=[-1,10], dtype='float64')
target_tensor = fluid.layers.data(
name="target_tensor", shape=[-1,20], dtype='float64')
result = fluid.layers.expand_as(x=data, target_tensor=target_tensor)
use_cuda = False
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
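The shape arithmetic above can be checked with a small numpy sketch (a hedged illustration: expand_as reads only target_tensor's shape, so the repeat factor along each dimension is target.shape[i] // x.shape[i]):

.. code-block:: python

    import numpy as np

    x = np.array([[[1], [2], [3]],
                  [[4], [5], [6]]])            # shape [2, 3, 1]
    target_shape = (2, 6, 2)
    reps = tuple(t // s for t, s in zip(target_shape, x.shape))  # (1, 2, 2)
    out = np.tile(x, reps)                     # shape [2, 6, 2]
    # out[0] -> [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]]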
@@ -9854,31 +9854,31 @@ def uniform_random_batch_size_like(input,
shape=[2,4]
result.shape[output_dim_idx] = input.shape[input_dim_idx],
output_dim_idx = 0,
input_dim_idx = 0,
result.shape[0] = input.shape[0],
then:
result=[[ 0.3443427 , -0.23056602, 0.3477049 , 0.06139076]] # result.shape=[1,4]
*Case 2:
Given:
input =[[0.946741 , 0.1357001 , 0.38086128]] # input.shape=[1,3]
shape=[2,4]
input_dim_idx=1
output_dim_idx=1
result.shape[output_dim_idx] = input.shape[input_dim_idx],
output_dim_idx = 1,
input_dim_idx = 1,
result.shape[1] = input.shape[1],
then:
result=[[-0.23133647, -0.84195036, 0.21441269],
[-0.08774924, 0.25605237, -0.09403259]] # result.shape=[2,3]
Args:
input (Variable): A Tensor. Supported data types: float32, float64.
shape (tuple|list): A python list or python tuple. The shape of the output Tensor, the data type is int.
input_dim_idx (int, optional): An index used to get the input dimension value which will be used to resize the output dimension. Default 0.
output_dim_idx (int, optional): An index used to indicate the specific dimension that will be replaced by corresponding input dimension value. Default 0.
min (float, optional): The lower bound on the range of random values to generate, the min is included in the range. Default -1.0.
max (float, optional): The upper bound on the range of random values to generate, the max is excluded in the range. Default 1.0.
@@ -9891,15 +9891,15 @@ def uniform_random_batch_size_like(input,
.. code-block:: python
import paddle.fluid as fluid
# example 1:
input = fluid.data(name="input", shape=[1, 3], dtype='float32')
out_1 = fluid.layers.uniform_random_batch_size_like(input, [2, 4]) # out_1.shape=[1, 4]
# example 2:
out_2 = fluid.layers.uniform_random_batch_size_like(input, [2, 4], input_dim_idx=1, output_dim_idx=1) # out_2.shape=[2, 3]
"""
helper = LayerHelper('uniform_random_batch_size_like', **locals())
@@ -9929,13 +9929,13 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
Args:
shape (Tuple[int] | List[int]): Shape of the generated random tensor.
mean (float): Mean of the random tensor, defaults to 0.0.
std (float): Standard deviation of the random tensor, defaults to 1.0.
seed (int): ${seed_comment}
dtype(np.dtype | core.VarDesc.VarType | str): Output data type, float32 or float64.
Returns:
@@ -9943,18 +9943,18 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
Examples:
.. code-block:: python
# declarative mode
import numpy as np
from paddle import fluid
x = fluid.layers.gaussian_random((2, 3), std=2., seed=10)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
start = fluid.default_startup_program()
main = fluid.default_main_program()
exe.run(start)
x_np, = exe.run(main, feed={}, fetch_list=[x])
@@ -9968,11 +9968,11 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
place = fluid.CPUPlace()
with dg.guard(place) as g:
x = fluid.layers.gaussian_random((2, 4), mean=2., dtype="float32", seed=10)
x_np = x.numpy()
x_np
# array([[2.3060477 , 2.676496 , 3.9911983 , 0.9990833 ],
# [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32)
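Up to seeding details, the dygraph example above corresponds to the numpy draw below (a hedged analogue, not the operator's RNG):

.. code-block:: python

    import numpy as np

    # i.i.d. samples from N(mean=2.0, std=1.0) with shape (2, 4), float32.
    x_np = np.random.normal(loc=2.0, scale=1.0, size=(2, 4)).astype('float32')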
@@ -10005,7 +10005,7 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
x (Variable): 2-D tensor, [batch_size, input_feature_dimensions]
min (Float): minimum, default 0.0.
max (Float): maximum, default 1.0.
seed (Float): Random seed, default 0. If seed is not 0, it will generate the same number every time.
dtype(np.dtype|core.VarDesc.VarType|str): The type of output data: float32, float16, int, etc.
Returns:
@@ -10095,7 +10095,7 @@ def gaussian_random_batch_size_like(input,
def sum(x):
"""
${comment}
Case 1:
::
Input:
@@ -10155,8 +10155,8 @@ def sum(x):
# the sum of input0 and input1 is 2-D Tensor with shape [2,3].
# dtype is the corresponding C++ data type, which may vary in different environments.
# Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t,
# so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux,
# and '__int64' on Windows. They all represent 64-bit integer variables.
"""
@@ -10363,7 +10363,7 @@ def strided_slice(input, axes, starts, ends, strides):
strides = [1, 1]
Then:
result = [ [5, 6, 7], ]
Case2:
Given:
data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
@@ -10373,7 +10373,7 @@ def strided_slice(input, axes, starts, ends, strides):
strides = [1, -1]
Then:
result = [ [8, 7, 6], ]
Case3:
Given:
data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
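Case2 above maps directly onto numpy slice notation, which makes the negative stride easy to verify (a hedged cross-check; strided_slice generalizes this to arbitrary axes):

.. code-block:: python

    import numpy as np

    data = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
    # axes=[0, 1], starts=[1, 3], ends=[2, 0], strides=[1, -1]
    result = data[1:2, 3:0:-1]
    # result -> [[8, 7, 6]]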
@@ -10669,7 +10669,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
bias(float): The bias to be put on the input.
bias_after_scale(bool): Apply bias addition after or before scaling. It is useful for numeric stability in some circumstances.
act(str, optional): Activation applied to the output such as tanh, softmax, sigmoid, relu.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
Variable(Tensor|LoDTensor): Output tensor of scale operator, with shape and data type same as input.
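The two bias placements controlled by bias_after_scale reduce to one line each; a minimal sketch, assuming the documented semantics out = scale * x + bias versus out = scale * (x + bias):

.. code-block:: python

    def scale_sketch(x, scale=1.0, bias=0.0, bias_after_scale=True):
        # bias_after_scale=True:  out = scale * x + bias
        # bias_after_scale=False: out = scale * (x + bias)
        return scale * x + bias if bias_after_scale else scale * (x + bias)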
@@ -10804,7 +10804,7 @@ Examples:
"x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
"y": np.random.randint(1, 5, size=[5]).astype('float32')
}
x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
y = fluid.data(name="y", shape=[5], dtype='float32')
z = fluid.layers.elementwise_add(x, y, axis=3)
@@ -10888,7 +10888,7 @@ Examples:
"x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
"y": np.random.randint(1, 5, size=[5]).astype('float32')
}
x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
y = fluid.data(name="y", shape=[5], dtype='float32')
z = fluid.layers.elementwise_div(x, y, axis=3)
@@ -10896,7 +10896,7 @@ Examples:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(),
fetch_list=[z.name])
print(z_value) # z.shape=[2,3,4,5]
@@ -10972,7 +10972,7 @@ Examples:
"x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
"y": np.random.randint(1, 5, size=[5]).astype('float32')
}
x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
y = fluid.data(name="y", shape=[5], dtype='float32')
z = fluid.layers.elementwise_sub(x, y, axis=3)
@@ -10980,7 +10980,7 @@ Examples:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(),
fetch_list=[z.name])
print(z_value) # z.shape=[2,3,4,5]
@@ -11056,7 +11056,7 @@ Examples:
"x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'),
"y": np.random.randint(1, 5, size=[5]).astype('float32')
}
x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32')
y = fluid.data(name="y", shape=[5], dtype='float32')
z = fluid.layers.elementwise_mul(x, y, axis=3)
@@ -11064,11 +11064,11 @@ Examples:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(),
fetch_list=[z.name])
print(z_value) # z.shape=[2,3,4,5]
"""
if in_dygraph_mode():
return _elementwise_op_in_dygraph(
@@ -11328,7 +11328,7 @@ for func in []:
Examples:
.. code-block:: python
import paddle.fluid as fluid
# example 1: shape(x) = (2, 3, 4, 5), shape(y) = (2, 3, 4, 5)
x0 = fluid.layers.data(name="x0", shape=[2, 3, 4, 5], dtype='float32')
@@ -11389,7 +11389,7 @@ def logical_and(x, y, out=None, name=None):
It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
Each element of Out is calculated by
.. math::
Out = X \land Y
@@ -11438,7 +11438,7 @@ def logical_or(x, y, out=None, name=None):
It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
Each element of Out is calculated by
.. math::
Out = X \lor Y
@@ -11487,7 +11487,7 @@ def logical_xor(x, y, out=None, name=None):
It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
Each element of Out is calculated by
.. math::
Out = (X \lor Y) \land \lnot (X \land Y)
@@ -11536,7 +11536,7 @@ def logical_not(x, out=None, name=None):
It operates element-wise on X, and returns the Out. X and Out are N-dim boolean LoDTensor or Tensor.
Each element of Out is calculated by
.. math::
Out = \lnot X
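The four definitions are mirrored by numpy's logical functions, which also lets one verify the xor identity above (a hedged check, not Paddle code):

.. code-block:: python

    import numpy as np

    x = np.array([True, True, False, False])
    y = np.array([True, False, True, False])
    # xor equals (X or Y) and not (X and Y), element-wise.
    assert (np.logical_xor(x, y) ==
            np.logical_and(np.logical_or(x, y),
                           np.logical_not(np.logical_and(x, y)))).all()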
@@ -11584,8 +11584,8 @@ def clip(x, min, max, name=None):
x(${x_type}): ${x_comment}
min(float): ${min_comment}
max(float): ${max_comment}
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
@@ -11630,9 +11630,9 @@ def clip_by_norm(x, max_norm, name=None):
Args:
x(${x_type}): ${x_comment}
max_norm(${max_norm_type}): ${max_norm_comment}
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
Variable:
@@ -11650,6 +11650,8 @@ def clip_by_norm(x, max_norm, name=None):
"""
helper = LayerHelper("clip_by_norm", **locals())
check_variable_and_dtype(x, 'X', ['float32'], 'clip_by_norm')
check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
if name is None:
name = unique_name.generate_with_ignorable_key(".".join(
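With the two checks just added, malformed calls fail fast in Python rather than deep inside the C++ kernel. A hedged sketch of the expected behavior (the exact exception type and message come from check_variable_and_dtype and check_type):

.. code-block:: python

    import paddle.fluid as fluid

    x64 = fluid.data(name='x64', shape=[None, 10], dtype='float64')
    try:
        fluid.layers.clip_by_norm(x=x64, max_norm=1.0)  # dtype is not float32
    except TypeError as e:
        print(e)   # dtype mismatch reported by check_variable_and_dtype

    x32 = fluid.data(name='x32', shape=[None, 10], dtype='float32')
    try:
        fluid.layers.clip_by_norm(x=x32, max_norm=1)    # max_norm must be float
    except TypeError as e:
        print(e)   # type mismatch reported by check_type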
@@ -11747,23 +11749,23 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
Args:
x (Variable): The first input Tensor/LoDTensor of mul_op.
y (Variable): The second input Tensor/LoDTensor of mul_op.
x_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. If the input $x$ is a tensor with more than two dimensions, $x$ will be flattened into a two-dimensional matrix first. The flattening rule is: the first `num_col_dims` dimensions will be flattened to form the first dimension of the final matrix (the height of the matrix), and the rest `rank(x) - num_col_dims` dimensions are flattened to form the second dimension of the final matrix (the width of the matrix). As a result, the height of the flattened matrix is equal to the product of $x$'s first `x_num_col_dims` dimensions' sizes, and the width of the flattened matrix is equal to the product of $x$'s last `rank(x) - num_col_dims` dimensions' sizes. For example, suppose $x$ is a 5-dimensional tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default is 1.
y_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. If the input $y$ is a tensor with more than two dimensions, $y$ will be flattened into a two-dimensional matrix first. The attribute `y_num_col_dims` determines how $y$ is flattened. See comments of `x_num_col_dims` for more details. Default is 1.
name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns:
Variable(Tensor/LoDTensor): The output Tensor/LoDTensor of mul op.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataX = fluid.layers.data(name="dataX", append_batch_size = False, shape=[2, 5], dtype="float32")
dataY = fluid.layers.data(name="dataY", append_batch_size = False, shape=[5, 3], dtype="float32")
output = fluid.layers.mul(dataX, dataY,
x_num_col_dims = 1,
y_num_col_dims = 1)
"""
if in_dygraph_mode():
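The flattening rule in x_num_col_dims is just a reshape; the worked example from the Args section checks out in numpy (a hedged sketch of the rule, not the operator itself):

.. code-block:: python

    import numpy as np

    def flatten_to_2d(t, num_col_dims):
        # Height is the product of the first num_col_dims sizes;
        # the remaining dimensions collapse into the width.
        h = int(np.prod(t.shape[:num_col_dims]))
        return t.reshape(h, -1)

    x = np.zeros((2, 3, 4, 5, 6))
    assert flatten_to_2d(x, 3).shape == (24, 30)   # [2*3*4, 5*6]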
@@ -11792,8 +11794,8 @@ def maxout(x, groups, name=None, axis=1):
x(${x_type}): ${x_comment}
groups(int): ${groups_comment}
axis(int, optional): ${axis_comment}
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
@@ -11808,8 +11810,8 @@ def maxout(x, groups, name=None, axis=1):
import paddle.fluid as fluid
input = fluid.data(
name='data',
shape=[None, 256, 32, 32],
dtype='float32')
out = fluid.layers.maxout(input, groups=2)
"""
@@ -11888,7 +11890,7 @@ def space_to_depth(x, blocksize, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
@@ -11955,10 +11957,10 @@ def affine_channel(x,
bias (Variable): 1D input of shape (C), the c-th element is the bias
of the affine transformation for the c-th channel of the input.
The data type is float32 or float64.
data_layout (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`. If input is 2D Tensor, you can ignore
data_layout.
name (str, default None): The name of this layer. For more information,
please refer to :ref:`api_guide_Name` .
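For data_layout="NCHW" the transformation is a per-channel scale and shift; a minimal numpy sketch, assuming broadcast over N, H and W:

.. code-block:: python

    import numpy as np

    def affine_channel_sketch(x, scale, bias):   # x: [N, C, H, W]
        # Each channel c is transformed as x[:, c] * scale[c] + bias[c].
        return x * scale.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)

    x = np.random.rand(2, 3, 4, 4).astype('float32')
    scale = np.array([1.0, 2.0, 0.5], dtype='float32')
    bias = np.array([0.0, 1.0, -1.0], dtype='float32')
    out = affine_channel_sketch(x, scale, bias)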
@@ -12081,7 +12083,7 @@ def similarity_focus(input, axis, indexes, name=None):
Args:
input(Variable): The input tensor variable(default float). It should
be a 4-D tensor with shape [BatchSize, A, B, C]. Data type is
float32 or float64.
axis(int): Indicating the dimension to be selected. It can only be
1, 2 or 3.
@@ -12187,7 +12189,7 @@ def grid_sampler(x, grid, name=None):
with shape [N, H, W] each, where x is indexing the 4th dimension
(in width dimension) of input data x and y is indexing the 3rd
dimension (in height dimension); the final result is the bilinear
interpolation value of the 4 nearest corner points. The output tensor
shape will be [N, C, H, W].
.. code-block:: text
@@ -12296,10 +12298,10 @@ def log_loss(input, label, epsilon=1e-4, name=None):
batch size. This input is a probability computed
by the previous operator. Data type float32.
label (Variable|list): The ground truth which is a 2-D tensor with
shape [N x 1], where N is the batch size.
Data type float32.
epsilon (float, optional): A small number for numerical stability. Default 1e-4.
name(str|None): For detailed information, please refer to
:ref:`api_guide_Name` . Usually name does not need to be set and is None by default.
Returns:
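A hedged numpy sketch of the loss this layer computes, assuming the usual negative log likelihood with epsilon guarding the logarithms:

.. code-block:: python

    import numpy as np

    def log_loss_sketch(p, label, eps=1e-4):
        # loss = -label * log(p + eps) - (1 - label) * log(1 - p + eps)
        return -label * np.log(p + eps) - (1.0 - label) * np.log(1.0 - p + eps)

    p = np.array([[0.9], [0.1]], dtype='float32')
    label = np.array([[1.0], [0.0]], dtype='float32')
    print(log_loss_sketch(p, label))  # small losses for confident, correct p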
@@ -12331,7 +12333,7 @@ def add_position_encoding(input, alpha, beta, name=None):
This operator performs a weighted sum of the input feature at each position
(position in the sequence) and the corresponding position encoding.
For more details of position encoding, please refer to `Attention Is All You
Need <http://arxiv.org/pdf/1706.03762.pdf>`_ .
The formula is as follows:
@@ -12356,8 +12358,8 @@ def add_position_encoding(input, alpha, beta, name=None):
weighted sum.
beta(float): Indicate the weight coefficient for position encoding when
performing weighted sum.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name does not need to be set and
is None by default.
Returns:
@@ -12414,19 +12416,19 @@ def bilinear_tensor_product(x,
- :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
Args:
x (Variable): 2-D input tensor with shape [batch_size, M]. Data type
is float32 or float64.
y (Variable): 2-D input tensor with shape [batch_size, N]. Data type
should be the same as **x**.
size (int): The dimension of this layer.
act (str|None): Activation to be applied to the output of this layer. Default None.
name(str|None): For detailed information, please refer to
:ref:`api_guide_Name` . Usually name does not need to be set and is None by default.
param_attr (ParamAttr|None): To specify the weight parameter attribute.
Default: None, which means the default weight parameter property is
used. See usage for details in :ref:`api_fluid_ParamAttr` .
bias_attr (ParamAttr|None): To specify the bias parameter attribute.
Default: None, which means the default bias parameter property is
used. See usage for details in :ref:`api_fluid_ParamAttr` .
Returns:
Variable: A 2-D Tensor of shape [batch_size, size]. Data type is the same as input **x**.
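The bilinear form above is a single einsum; a hedged numpy sketch with an assumed weight layout W of shape [size, M, N]:

.. code-block:: python

    import numpy as np

    def bilinear_sketch(x, y, W):   # x: [B, M], y: [B, N], W: [size, M, N]
        # out[b, i] = sum over (m, n) of x[b, m] * W[i, m, n] * y[b, n]
        return np.einsum('bm,imn,bn->bi', x, W, y)

    x = np.random.rand(4, 5)
    y = np.random.rand(4, 6)
    W = np.random.rand(3, 5, 6)
    assert bilinear_sketch(x, y, W).shape == (4, 3)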
@@ -12491,7 +12493,7 @@ def get_tensor_from_selected_rows(x, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
b = fluid.default_main_program().global_block()
input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
@@ -12516,7 +12518,7 @@ def shuffle_channel(x, group, name=None):
Please refer to the paper
https://arxiv.org/pdf/1707.01083.pdf
.. code-block:: text
Given a 4-D tensor input with the shape (N, C, H, W):
@@ -12537,22 +12539,22 @@ def shuffle_channel(x, group, name=None):
out.shape = (1, 4, 2, 2)
out.data = [[[[0.1, 0.2],
[0.2, 0.3]],
[[0.5, 0.6],
[0.6, 0.7]],
[[0.3, 0.4],
[0.4, 0.5]],
[[0.7, 0.8],
[0.8, 0.9]]]]
Args:
x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W]
group(int): The number of subgroups. It should divide the number of channels.
Returns:
out(Variable): the channel shuffling result is a tensor variable with the
same shape and same type as the input.
Raises:
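The concrete example above (group=2 sending channels 0,1,2,3 to 0,2,1,3) is reproduced by the classic reshape/transpose trick; a hedged numpy sketch:

.. code-block:: python

    import numpy as np

    def shuffle_channel_sketch(x, group):     # x: [N, C, H, W]
        # Split C channels into [group, C // group], swap the two axes,
        # and flatten back, interleaving channels across groups.
        n, c, h, w = x.shape
        return (x.reshape(n, group, c // group, h, w)
                 .transpose(0, 2, 1, 3, 4)
                 .reshape(n, c, h, w))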
@@ -12584,10 +12586,10 @@ def shuffle_channel(x, group, name=None):
def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
"""
**Temporal Shift Operator**
${comment}
Args:
x(Variable): ${x_comment}
seg_num(int): ${seg_num_comment}
shift_ratio(float): ${shift_ratio_comment}
@@ -12596,7 +12598,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
None by default.
Returns:
out(Variable): The temporal shifting result is a tensor variable with the
same shape and same data type as the input.
Raises:
@@ -12705,23 +12707,23 @@ class PyFuncRegistry(object):
@templatedoc()
def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
"""
This OP is used to register customized Python OP to Paddle Fluid. The design
principle of py_func is that LoDTensor and numpy array can be converted to each
other easily. So you can use Python and numpy API to register a python OP.
The forward function of the registered OP is ``func`` and the backward function
of that is ``backward_func``. Paddle will call ``func`` at forward runtime and
call ``backward_func`` at backward runtime (if ``backward_func`` is not None).
``x`` is the input of ``func``, whose type must be LoDTensor; ``out`` is
the output of ``func``, whose type can be either LoDTensor or numpy array.
The input of the backward function ``backward_func`` is ``x``, ``out`` and
the gradient of ``out``. If some variables of ``out`` have no gradient, the
relevant input variable of ``backward_func`` is None. If some variables of
``x`` do not have a gradient, the user should return None in ``backward_func``.
The data type and shape of ``out`` should also be set correctly before this
API is called, and the data type and shape of the gradient of ``out`` and
``x`` will be inferred automatically.
This API can also be used to debug the neural network by setting the ``func``
@@ -12729,35 +12731,35 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
Args:
func (callable): The forward function of the registered OP. When the network
is running, the forward output ``out`` will be calculated according to this
function and the forward input ``x``. In ``func`` , it's suggested that we
actively convert LoDTensor into a numpy array, so that we can use Python and
numpy API arbitrarily. If not, some operations of numpy may not be compatible.
x (Variable|tuple(Variable)|list[Variable]): The input of the forward function ``func``.
It can be Variable|tuple(Variable)|list[Variable], where Variable is LoDTensor or
Tensor. In addition, multiple Variables should be passed in the form of tuple(Variable)
or list[Variable].
out (Variable|tuple(Variable)|list[Variable]): The output of the forward function ``func``,
it can be Variable|tuple(Variable)|list[Variable], where Variable can be either LoDTensor
or numpy array. Since Paddle cannot automatically infer the shape and type of ``out``,
you must create ``out`` in advance.
backward_func (callable, optional): The backward function of the registered OP.
Its default value is None, which means there is no reverse calculation. If
it is not None, ``backward_func`` is called to calculate the gradient of
``x`` when the network is at backward runtime.
skip_vars_in_backward_input (Variable, optional): It's used to limit the input
variable list of ``backward_func``, and it can be Variable|tuple(Variable)|list[Variable].
It must belong to either ``x`` or ``out``. The default value is None, which means
that no variables need to be removed from ``x`` and ``out``. If it is not None,
these variables will not be the input of ``backward_func``. This parameter is only
useful when ``backward_func`` is not None.
Returns:
Variable|tuple(Variable)|list[Variable]: The output ``out`` of the forward function ``func``.
Examples:
.. code-block:: python
# example 1:
import paddle.fluid as fluid
import six
@@ -12768,15 +12770,15 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
return np.tanh(x)
# Skip x in backward function and return the gradient of x
# LoDTensor must be actively converted to numpy array, otherwise,
# operations such as +/- can't be used.
def tanh_grad(y, dy):
return np.array(dy) * (1 - np.square(np.array(y)))
# Creates a forward function for debugging running networks (print value)
def debug_func(x):
print(x)
def create_tmp_var(name, dtype, shape):
return fluid.default_main_program().current_block().create_var(
name=name, dtype=dtype, shape=shape)
@@ -12788,7 +12790,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
new_hidden = create_tmp_var(name='hidden_{}'.format(idx),
dtype=hidden.dtype, shape=hidden.shape)
# User-defined forward and backward
hidden = fluid.layers.py_func(func=tanh, x=hidden,
out=new_hidden, backward_func=tanh_grad,
skip_vars_in_backward_input=hidden)
@@ -12800,16 +12802,16 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
loss = fluid.layers.cross_entropy(input=prediction, label=label)
return fluid.layers.mean(loss)
# example 2:
# This example shows how to turn LoDTensor into numpy array and
# use numpy API to register a Python OP
import paddle.fluid as fluid
import numpy as np
def element_wise_add(x, y):
# LoDTensor must be actively converted to numpy array, otherwise,
# numpy.shape can't be used.
x = np.array(x)
y = np.array(y)
if x.shape != y.shape:
@@ -12833,7 +12835,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
# Input of the forward function
x = fluid.data(name='x', shape=[2,3], dtype='int32')
y = fluid.data(name='y', shape=[2,3], dtype='int32')
# Output of the forward function, name/dtype/shape must be specified
output = create_tmp_var('output','int32', [3,1])
@@ -12846,7 +12848,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
# Feed numpy array to main_program
input1 = np.random.randint(1, 10, size=[2,3], dtype='int32')
input2 = np.random.randint(1, 10, size=[2,3], dtype='int32')
out = exe.run(main_program,
feed={'x':input1, 'y':input2},
fetch_list=[output.name])
print("{0} + {1} = {2}".format(input1, input2, out))
@@ -12945,8 +12947,8 @@ def psroi_pool(input,
spatial_scale (float): ${spatial_scale_comment} Default: 1.0
pooled_height (int): ${pooled_height_comment} Default: 1
pooled_width (int): ${pooled_width_comment} Default: 1
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
@@ -13016,8 +13018,8 @@ def prroi_pool(input,
Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
pooled_height (integer): The pooled output height. Default: 1.
pooled_width (integer): The pooled output width. Default: 1.
batch_roi_nums (Variable): The number of roi for each image in batch. It
should be 1-D Tensor, with shape [N] and dtype int64,
where N is the batch size. Default: None. Note: the lod of input should be
empty when batch_roi_nums has values;
name (str, default None): The name of this operation.
@@ -13033,7 +13035,7 @@ def prroi_pool(input,
x = fluid.data(name='x', shape=[None, 490, 28, 28], dtype='float32')
rois = fluid.data(name='rois', shape=[None, 4], lod_level=1, dtype='float32')
pool_out = fluid.layers.prroi_pool(x, rois, 1.0, 7, 7)
## prroi_pool with batch_roi_num
batchsize=4
x2 = fluid.data(name='x2', shape=[batchsize, 490, 28, 28], dtype='float32')
@@ -13075,7 +13077,7 @@ def pixel_shuffle(x, upscale_factor):
to a tensor of shape [N, C/r**2, H*r, W*r].
This is useful for implementing efficient sub-pixel convolution
with a stride of 1/r.
Please refer to the paper `Real-Time Single Image and Video Super-Resolution
Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
by Shi et al. (2016) for more details.
@@ -13101,13 +13103,13 @@ def pixel_shuffle(x, upscale_factor):
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
input_data = np.random.rand(2,9,4,4).astype("float32")
output_data = exe.run(fluid.default_main_program(),
feed={"input":input_data},
fetch_list=[output],
return_numpy=True)
# print(output.shape)
# (2L, 1L, 12L, 12L)
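The [N, C, H, W] -> [N, C/r**2, H*r, W*r] rearrangement is again a reshape/transpose pair; a hedged numpy sketch that reproduces the (2, 1, 12, 12) result above:

.. code-block:: python

    import numpy as np

    def pixel_shuffle_sketch(x, r):           # x: [N, C, H, W], C % (r*r) == 0
        # The r*r sub-channels of each output channel are interleaved
        # into the spatial dimensions.
        n, c, h, w = x.shape
        return (x.reshape(n, c // (r * r), r, r, h, w)
                 .transpose(0, 1, 4, 2, 5, 3)
                 .reshape(n, c // (r * r), h * r, w * r))

    x = np.random.rand(2, 9, 4, 4).astype('float32')
    assert pixel_shuffle_sketch(x, 3).shape == (2, 1, 12, 12)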
@@ -13242,7 +13244,7 @@ def where(condition):
condition(Variable): A bool tensor with rank at least 1, the data type is bool.
Returns:
Variable, the output data type is int64: the tensor variable storing a 2-D tensor, which contains the coordinates of all true elements.
Examples:
.. code-block:: python
@@ -13298,7 +13300,7 @@ def sign(x):
import numpy as np
# [1.0, 0.0, -1.0]
data = fluid.layers.sign(np.array([3.0, 0.0, -2.0], dtype='float32'))
"""
helper = LayerHelper("sign", **locals())
@@ -13315,7 +13317,7 @@ def sign(x):
def unique(x, dtype='int32'):
"""
**unique**
Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
@@ -13357,7 +13359,7 @@ def unique(x, dtype='int32'):
def unique_with_counts(x, dtype='int32'):
"""
This OP returns a unique tensor for `x`, a count tensor holding the number of occurrences of each unique value in the raw input, \
and an index tensor pointing to this unique tensor.
**NOTICE**: This op supports the variable type of Tensor only.
@@ -13365,7 +13367,7 @@ def unique_with_counts(x, dtype='int32'):
x(Variable): A 1-D input tensor with input shape of :math:`[N]` , the input data type is float32, float64, int32, int64.
dtype(np.dtype|core.VarDesc.VarType|str): The type of count and index tensor, it could be int32, int64. Default value is int32.
Returns:
tuple, the variable type in tuple is Tensor, the output :attr:`out` data type is the same as input :attr:`x`, \
and data type of output :attr:`index` and :attr:`count` will be int32 or int64: The :attr:`out` is the unique tensor for input :attr:`x`,\
the data shape is :math:`[K]`, where `K` may be different from the `N` in the shape of :attr:`x`. :attr:`index` is an index tensor pointing\
@@ -13432,24 +13434,24 @@ def deformable_conv(input,
Compute 2-D deformable convolution on 4-D input. Compute 2-D deformable convolution on 4-D input.
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
Deformable Convolution v2: Deformable Convolution v2:
.. math:: .. math::
y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k} y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}
Deformable Convolution v1: Deformable Convolution v1:
.. math:: .. math::
y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)} y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}
Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
Which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results Which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
<https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_. <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
Example: Example:
- Input: - Input:
...@@ -13501,7 +13503,7 @@ def deformable_conv(input, ...@@ -13501,7 +13503,7 @@ def deformable_conv(input,
connected to the second half of the input channels. Default: groups=1. connected to the second half of the input channels. Default: groups=1.
deformable_groups (int): The number of deformable group partitions. deformable_groups (int): The number of deformable group partitions.
Default: deformable_groups = 1. Default: deformable_groups = 1.
im2col_step (int): Maximum number of images per im2col computation; im2col_step (int): Maximum number of images per im2col computation;
The total batch size should be devisable by this value or smaller The total batch size should be devisable by this value or smaller
than this value; if you face out of memory problem, you can try than this value; if you face out of memory problem, you can try
to use a smaller value here. to use a smaller value here.
...@@ -13510,7 +13512,7 @@ def deformable_conv(input,
    of deformable conv. If it is set to None or one attribute of ParamAttr,
    deformable conv will create ParamAttr as param_attr.
    If the Initializer of the param_attr is not set, the parameter is
    initialized with :math:`Normal(0.0, std)`, and the
    :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
bias_attr (ParamAttr|bool, Optional): The parameter attribute for the bias of
    deformable conv layer. If it is set to False, no bias will be added
...@@ -13531,7 +13533,7 @@ def deformable_conv(input,
.. code-block:: python

    # deformable conv v2:
    import paddle.fluid as fluid
    C_in, H_in, W_in = 3, 32, 32
    filter_size, deformable_groups = 3, 1
...@@ -13659,7 +13661,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
Parameters:
    x(Variable): 4-D Tensor, input tensor of format [N, C, H, W],
        data type can be float32 or float64.
    kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w]
        or an integer k treated as [k, k].
...@@ -13676,16 +13678,16 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
    dilations(int|list): the dilations of convolution kernel, should be
        [dilation_h, dilation_w], or an integer dilation treated as
        [dilation, dilation]. By default, it is [1, 1].
    name(str, optional): The default value is None.
        Normally there is no need for the user to set this property.
        For more information, please refer to :ref:`api_guide_Name`
Returns:
    The tensor variable corresponding to the sliding local blocks.
    The output shape is [N, Cout, Lout] as described above.
    Cout is the total number of values within each block,
    and Lout is the total number of such blocks.
    The data type of output is the same as the input :math:`x`.
Return Type:
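The exact shape formulas are elided in this hunk; as a hedged sketch, Cout and Lout follow the standard im2col arithmetic (an assumption based on the conventional unfold definition, not copied from the docstring):

.. code-block:: python

    # Standard im2col output-shape arithmetic; padding order
    # (up, left, down, right) is assumed here.
    def unfold_out_shape(N, C, H, W, k=(3, 3), stride=(1, 1),
                         padding=(0, 0, 0, 0), dilation=(1, 1)):
        h_out = (H + padding[0] + padding[2]
                 - dilation[0] * (k[0] - 1) - 1) // stride[0] + 1
        w_out = (W + padding[1] + padding[3]
                 - dilation[1] * (k[1] - 1) - 1) // stride[1] + 1
        return N, C * k[0] * k[1], h_out * w_out  # (N, Cout, Lout)

    print(unfold_out_shape(2, 3, 32, 32))  # (2, 27, 900)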
...@@ -13768,21 +13770,21 @@ def deformable_roi_pooling(input,
                           name=None):
"""
Deformable ROI Pooling Layer
Performs deformable region-of-interest pooling on inputs. As described
in `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_, it obtains an offset for each bin after
roi pooling so that pooling happens at the correct region. The batch size will change to the number of region bounding boxes after deformable_roi_pooling.
The operation has three steps:
1. Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height.
2. Adding an offset to each pixel in the ROI to get a new location, whose new value is computed directly through
   bilinear interpolation with the four nearest pixels.
3. Sampling several points in each bin and averaging their values as the output (see the shape sketch after this list).
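A hedged shape sketch of what the three steps produce (toy numbers; the names here are illustrative, not the op's API):

.. code-block:: python

    num_rois, C = 16, 192               # proposals and input channels (toy)
    pooled_height, pooled_width = 8, 8  # bins per proposal
    # Steps 2-3 fill one value per bin per channel, so the pooled output is
    # (num_rois, C, pooled_height, pooled_width): the batch dimension becomes
    # the number of ROIs, as noted above. With position_sensitive=True the
    # channel count is instead divided across the bins (an assumption here).
    out_shape = (num_rois, C, pooled_height, pooled_width)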
Args:
    input (Variable): The input of deformable roi pooling, a tensor whose value type is float32. The shape of input is
        [N, C, H, W], where N is batch size, C is number of input channels,
...@@ -13792,14 +13794,14 @@ def deformable_roi_pooling(input,
        is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
        the top left coordinates, and (x2, y2) is the bottom
        right coordinates, whose value type is float32.
    trans (Variable): Offset of features on ROIs while pooling, whose value type is float32. The format is [N, C, H, W], where
        N is number of ROIs, C is number of channels, which indicate the offset distance
        in the x and y directions, H is pooled height, and W is pooled width.
    no_trans (bool): Whether to skip adding the offset to get new values while roi pooling;
        if True, no offset will be added in the operation. Default: False.
    spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width), whose value type is float32.
        Equals the reciprocal of total stride in convolutional layers. Default: 1.0.
    group_size (list|tuple): The number of groups into which the input channels are divided, given as a list or tuple of int32. (E.g. if the number of input channels
        is k1 * k2 * (C + 1), then k1 and k2 are the group height and width and C + 1 is the number of output
        channels.) E.g. (4, 6), where 4 is the height of the group and 6 is the width. Default: [1, 1].
    pooled_height (int): The pooled output height, whose value type is int32. Default: 1.
...@@ -13821,50 +13823,50 @@ def deformable_roi_pooling(input,
    # position_sensitive=True
    import paddle.fluid as fluid
    input = fluid.data(name="input",
                       shape=[2, 192, 64, 64],
                       dtype='float32')
    rois = fluid.data(name="rois",
                      shape=[-1, 4],
                      dtype='float32',
                      lod_level=1)
    trans = fluid.data(name="trans",
                       shape=[2, 384, 64, 64],
                       dtype='float32')
    x = fluid.layers.deformable_roi_pooling(input=input,
                                            rois=rois,
                                            trans=trans,
                                            no_trans=False,
                                            spatial_scale=1.0,
                                            group_size=(1, 1),
                                            pooled_height=8,
                                            pooled_width=8,
                                            part_size=(8, 8),
                                            sample_per_part=4,
                                            trans_std=0.1,
                                            position_sensitive=True)

    # position_sensitive=False
    import paddle.fluid as fluid
    input = fluid.data(name="input",
                       shape=[2, 192, 64, 64],
                       dtype='float32')
    rois = fluid.data(name="rois",
                      shape=[-1, 4],
                      dtype='float32',
                      lod_level=1)
    trans = fluid.data(name="trans",
                       shape=[2, 384, 64, 64],
                       dtype='float32')
    x = fluid.layers.deformable_roi_pooling(input=input,
                                            rois=rois,
                                            trans=trans,
                                            no_trans=False,
                                            spatial_scale=1.0,
                                            group_size=(1, 1),
                                            pooled_height=8,
                                            pooled_width=8,
                                            part_size=(8, 8),
                                            sample_per_part=4,
                                            trans_std=0.1,
                                            position_sensitive=False)
""" """
...@@ -13925,8 +13927,8 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): ...@@ -13925,8 +13927,8 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
the `shard_id` matches the shard with the input index inside, the index is the `shard_id` matches the shard with the input index inside, the index is
recomputed on the basis of the shard offset, elsewise it is set to recomputed on the basis of the shard offset, elsewise it is set to
`ignore_value`. The detail is as follows: `ignore_value`. The detail is as follows:
:: ::
shard_size = (index_num + nshards - 1) // nshards shard_size = (index_num + nshards - 1) // nshards
y = x % shard_size if x // shard_size == shard_id else ignore_value y = x % shard_size if x // shard_size == shard_id else ignore_value
...@@ -13935,22 +13937,22 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): ...@@ -13935,22 +13937,22 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
Examples: Examples:
:: ::
    Input:
        X.shape = [4, 1]
        X.data = [[1], [6], [12], [19]]
        index_num = 20
        nshards = 2
        ignore_value = -1

    if shard_id == 0, we get:
        Out.shape = [4, 1]
        Out.data = [[1], [6], [-1], [-1]]

    if shard_id == 1, we get:
        Out.shape = [4, 1]
        Out.data = [[-1], [-1], [2], [9]]
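The worked example can be re-derived in plain Python directly from the formula above (a self-contained check, not part of the original docstring):

.. code-block:: python

    def shard(xs, index_num, nshards, shard_id, ignore_value=-1):
        # Direct transcription of the formula above.
        shard_size = (index_num + nshards - 1) // nshards
        return [x % shard_size if x // shard_size == shard_id else ignore_value
                for x in xs]

    xs = [1, 6, 12, 19]
    print(shard(xs, 20, 2, shard_id=0))  # [1, 6, -1, -1]
    print(shard(xs, 20, 2, shard_id=1))  # [-1, -1, 2, 9]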
Args:
    - **input** (Variable): Input indices, last dimension must be 1.
    - **index_num** (scalar): An integer defining the range of the index.
...@@ -14015,26 +14017,26 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
    threshold (float, optional): The threshold in Relu function. Default: 6.0
    scale (float, optional): The scale factor. Default: 6.0
    offset (float, optional): The offset factor. Default: 3.0
    name (str, optional): The default value is None. Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name`
Returns:
    Variable: The output tensor with the same shape and data type as input.
Examples:
.. code-block:: python
    import paddle.fluid as fluid
    import numpy as np

    DATATYPE = 'float32'
    x_data = np.array([i for i in range(1, 5)]).reshape([1, 1, 4]).astype(DATATYPE)
    x = fluid.data(name="x", shape=[None, 1, 4], dtype=DATATYPE)
    y = fluid.layers.hard_swish(x)
    place = fluid.CPUPlace()
    # place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
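    # A hedged continuation sketch (the rest of the original example is
    # elided past this point): run the program and compare against the
    # usual hard_swish formula,
    # out = x * clip(x + offset, 0, threshold) / scale,
    # which is treated as an assumption here, not the docstring's own text.
    out, = exe.run(feed={'x': x_data}, fetch_list=[y])
    ref = x_data * np.clip(x_data + 3.0, 0.0, 6.0) / 6.0
    assert np.allclose(out, ref)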
...@@ -14082,8 +14084,8 @@ def gather_tree(ids, parents):
           [[0 0]
            [0 1]]]
Then:
    gather_tree(ids, parents)
    = [[[2 2]
        [1 6]]
       [[3 3]
...@@ -14139,17 +14141,17 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
Examples:
::

    Input:
        shape = [1, 2]
    Output:
        result = [[0.8505902, 0.8397286]]

Args:
    shape (list|tuple|Variable): The shape of the output Tensor. If the shape is a list or tuple,
        its elements can be integers
        or Tensors with the shape [1], and the type of the Tensor must be int32 or int64.
        If the shape is a Variable, it is a 1-D Tensor, and the type of the Tensor must be int32 or int64.
    dtype(np.dtype|core.VarDesc.VarType|str, optional): The type of the output Tensor. Supported data types: float32, float64.
        Default: float32.
...@@ -14160,12 +14162,12 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
        operator will always generate the same random numbers every time.
        Default 0.
Returns:
    Variable: A Tensor of the specified shape filled with uniform_random values.
Raises:
    TypeError: The shape type should be list or tuple or variable.
Examples:
.. code-block:: python
...@@ -14187,7 +14189,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
    result_3 = fluid.layers.uniform_random(var_shape)
    var_shape_int32 = fluid.data(name='var_shape_int32', shape=[2], dtype="int32")
    result_4 = fluid.layers.uniform_random(var_shape_int32)
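    # A hedged addition (illustrative, not in the original snippet; the
    # variable name result_5 is hypothetical): passing a non-zero seed
    # makes the generated values reproducible across runs of the program.
    result_5 = fluid.layers.uniform_random([2, 3], min=-1.0, max=1.0, seed=1)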
""" """
......