未验证 提交 8fc7f1b6 编写于 作者: G Guo Sheng 提交者: GitHub

Fix api docs in RNN, Transformer, layer_norm, WeightNormParamAttr (#29235)

* Fix api docs in RNN, Transformer, layer_norm, WeightNormParamAttr.
test=develop

* Fix api doc for print in label_smooth.
test=develop

* Update api docs according to review comments.
Add name argument in RNN back.
test=develop
上级 c940f842
......@@ -842,52 +842,52 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
def crf_decoding(input, param_attr, label=None, length=None):
"""
:api_attr: Static Graph
${comment}
Args:
input(${emission_type}): ${emission_comment}
input(Tensor): ${emission_comment}
param_attr (ParamAttr|None): To specify the weight parameter attribute.
Default: None, which means the default weight parameter property is
used. See usage for details in :ref:`api_fluid_ParamAttr` .
used. See usage for details in :ref:`api_paddle_fluid_param_attr_ParamAttr` .
label(${label_type}, optional): ${label_comment}
length(${length_type}, optional): ${length_comment}
Returns:
Variable: ${viterbi_path_comment}
Tensor: ${viterbi_path_comment}
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
paddle.enable_static()
# LoDTensor-based example
num_labels = 10
feature = fluid.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
label = fluid.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
emission = fluid.layers.fc(input=feature, size=num_labels)
feature = paddle.static.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
emission = paddle.static.nn.fc(feature, size=num_labels)
crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label,
param_attr=fluid.ParamAttr(name="crfw"))
crf_decode = fluid.layers.crf_decoding(input=emission,
param_attr=fluid.ParamAttr(name="crfw"))
crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label,
param_attr=paddle.ParamAttr(name="crfw"))
crf_decode = paddle.static.nn.crf_decoding(input=emission,
param_attr=paddle.ParamAttr(name="crfw"))
# Common tensor example
num_labels, max_len = 10, 20
feature = fluid.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
label = fluid.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
length = fluid.data(name='length', shape=[-1, 1], dtype='int64')
emission = fluid.layers.fc(input=feature, size=num_labels,
feature = paddle.static.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
label = paddle.static.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
length = paddle.static.data(name='length', shape=[-1, 1], dtype='int64')
emission = paddle.static.nn.fc(feature, size=num_labels,
num_flatten_dims=2)
crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
param_attr=fluid.ParamAttr(name="crfw_pad"))
crf_decode = fluid.layers.crf_decoding(input=emission, length=length,
param_attr=fluid.ParamAttr(name="crfw_pad"))
crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
param_attr=paddle.ParamAttr(name="crfw_pad"))
crf_decode = paddle.static.nn.crf_decoding(input=emission, length=length,
param_attr=paddle.ParamAttr(name="crfw_pad"))
"""
check_variable_and_dtype(input, 'input', ['float32', 'float64'],
'crf_decoding')
......@@ -3435,7 +3435,7 @@ def layer_norm(input,
- :math:`b`: the trainable bias parameter.
Args:
input(Variable): A multi-dimension ``Tensor`` , and the data type is float32 or float64.
input(Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64.
scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
normalization. Default: True.
shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
......@@ -3460,24 +3460,17 @@ def layer_norm(input,
name(str): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: ``Tensor`` indicating the normalized result, the data type is the same as ``input`` , and the return dimension is the same as ``input`` .
Tensor: ``Tensor`` indicating the normalized result, the data type is the same as ``input`` , and the return dimension is the same as ``input`` .
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy as np
import paddle
paddle.enable_static()
x = fluid.data(name='x', shape=[-1, 32, 32], dtype='float32')
hidden1 = fluid.layers.layer_norm(input=x, begin_norm_axis=1)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
np_x = np.random.random(size=(8, 3, 32, 32)).astype('float32')
output = exe.run(feed={"x": np_x}, fetch_list = [hidden1])
print(output)
x = paddle.static.data(name='x', shape=[8, 32, 32], dtype='float32')
output = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1)
print(output.shape) # [8, 32, 32]
"""
assert in_dygraph_mode(
) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!"
......@@ -9752,7 +9745,7 @@ def prelu(x, mode, param_attr=None, name=None):
if mode not in ['all', 'channel', 'element']:
raise ValueError('mode should be one of all, channel, element.')
alpha_shape = [1]
# NOTE(): The input of this API should be ``N,C,...`` format,
# NOTE(): The input of this API should be ``N,C,...`` format,
# which means x.shape[0] is batch_size and x.shape[1] is channel.
if mode == 'channel':
assert len(
......
......@@ -226,8 +226,8 @@ class WeightNormParamAttr(ParamAttr):
Note:
``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
:ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` ,
:ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` .
Args:
......@@ -245,8 +245,8 @@ class WeightNormParamAttr(ParamAttr):
optimizer is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
Default 1.0.
regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are
two methods: :ref:`api_paddle_fluid_regularizer_L1Decay` ,
:ref:`api_paddle_fluid_regularizer_L2DecayRegularizer`.
two methods: :ref:`api_paddle_regularizer_L1Decay` ,
:ref:`api_paddle_regularizer_L2Decay`.
If regularizer is also set in ``optimizer``
(such as :ref:`api_paddle_optimizer_SGD` ), that regularizer setting in
optimizer will be ignored. Default None, meaning there is no regularization.
......
......@@ -1554,7 +1554,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
paddle.disable_static()
x = paddle.to_tensor(x_data, stop_gradient=False)
output = paddle.nn.functional.label_smooth(x)
print(output.numpy())
print(output)
#[[[0.03333334 0.93333334 0.03333334]
# [0.93333334 0.03333334 0.93333334]]]
......
......@@ -643,7 +643,7 @@ class TransformerDecoderLayer(Layer):
for linear in FFN. Otherwise, the three sub-layers all uses it as
`weight_attr` to create parameters. Default: None, which means the
default weight parameter property is used. See usage for details
in :ref:`api_fluid_ParamAttr` .
in :ref:`api_paddle_fluid_param_attr_ParamAttr` .
bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
self attention, `bias_attr[1]` would be used as `bias_attr` for
......@@ -1199,7 +1199,7 @@ class Transformer(Layer):
transformer_paddle = Transformer(
d_model, n_head, dim_feedforward=dim_feedforward)
mask = transformer_paddle.generate_square_subsequent_mask(length)
print(mask.numpy())
print(mask)
# [[ 0. -inf -inf -inf -inf]
# [ 0. 0. -inf -inf -inf]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册