From 78380587dcb52fd66a1dedbb35544592baa5632a Mon Sep 17 00:00:00 2001
From: Guo Sheng
Date: Mon, 7 Dec 2020 11:47:12 +0800
Subject: [PATCH] Fix api docs in RNN, Transformer, layer_norm, WeightNormParamAttr (#29235) (#29407)

* Fix api docs in RNN, Transformer, layer_norm, WeightNormParamAttr.
test=develop

* Fix api doc for print in label_smooth.
test=develop

* Update api docs according to review comments.
Add name argument in RNN back.
test=develop
---
 python/paddle/fluid/layers/nn.py      | 57 ++++++++++++---------
 python/paddle/fluid/param_attr.py     |  8 ++--
 python/paddle/nn/functional/common.py |  2 +-
 python/paddle/nn/layer/transformer.py |  4 +-
 4 files changed, 32 insertions(+), 39 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index db9dd71c61..9a6d4b4f3a 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -842,52 +842,52 @@ def linear_chain_crf(input, label, param_attr=None, length=None):
 def crf_decoding(input, param_attr, label=None, length=None):
     """
     :api_attr: Static Graph
+
     ${comment}
 
     Args:
-        input(${emission_type}): ${emission_comment}
+        input(Tensor): ${emission_comment}
         param_attr (ParamAttr|None): To specify the weight parameter attribute.
             Default: None, which means the default weight parameter property is
-            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+            used. See usage for details in :ref:`api_paddle_fluid_param_attr_ParamAttr` .
         label(${label_type}, optional): ${label_comment}
         length(${length_type}, optional): ${length_comment}
 
     Returns:
-        Variable: ${viterbi_path_comment}
+        Tensor: ${viterbi_path_comment}
 
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
             import paddle
             paddle.enable_static()
 
             # LoDTensor-based example
             num_labels = 10
-            feature = fluid.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
-            label = fluid.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
-            emission = fluid.layers.fc(input=feature, size=num_labels)
+            feature = paddle.static.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
+            label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
+            emission = paddle.static.nn.fc(feature, size=num_labels)
 
-            crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label,
-                      param_attr=fluid.ParamAttr(name="crfw"))
-            crf_decode = fluid.layers.crf_decoding(input=emission,
-                      param_attr=fluid.ParamAttr(name="crfw"))
+            crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label,
+                      param_attr=paddle.ParamAttr(name="crfw"))
+            crf_decode = paddle.static.nn.crf_decoding(input=emission,
+                      param_attr=paddle.ParamAttr(name="crfw"))
 
             # Common tensor example
             num_labels, max_len = 10, 20
-            feature = fluid.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
-            label = fluid.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
-            length = fluid.data(name='length', shape=[-1, 1], dtype='int64')
-            emission = fluid.layers.fc(input=feature, size=num_labels,
+            feature = paddle.static.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
+            label = paddle.static.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
+            length = paddle.static.data(name='length', shape=[-1, 1], dtype='int64')
+            emission = paddle.static.nn.fc(feature, size=num_labels,
                                        num_flatten_dims=2)
 
-            crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
-                      param_attr=fluid.ParamAttr(name="crfw_pad"))
-            crf_decode = fluid.layers.crf_decoding(input=emission, length=length,
-                      param_attr=fluid.ParamAttr(name="crfw_pad"))
+            crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
+                      param_attr=paddle.ParamAttr(name="crfw_pad"))
+            crf_decode = paddle.static.nn.crf_decoding(input=emission, length=length,
+                      param_attr=paddle.ParamAttr(name="crfw_pad"))
     """
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                              'crf_decoding')
@@ -3427,7 +3427,7 @@ def layer_norm(input,
     - :math:`b`: the trainable bias parameter.
 
     Args:
-        input(Variable): A multi-dimension ``Tensor`` , and the data type is float32 or float64.
+        input(Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64.
         scale(bool, optional): Whether to learn the adaptive gain :math:`g` after
             normalization. Default: True.
         shift(bool, optional): Whether to learn the adaptive bias :math:`b` after
@@ -3452,24 +3452,17 @@ def layer_norm(input,
         name(str): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Variable: ``Tensor`` indicating the normalized result, the data type is the same as ``input`` , and the return dimension is the same as ``input`` .
+        Tensor: ``Tensor`` indicating the normalized result, the data type is the same as ``input`` , and the return dimension is the same as ``input`` .
 
     Examples:
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import numpy as np
             import paddle
             paddle.enable_static()
-            x = fluid.data(name='x', shape=[-1, 32, 32], dtype='float32')
-            hidden1 = fluid.layers.layer_norm(input=x, begin_norm_axis=1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            np_x = np.random.random(size=(8, 3, 32, 32)).astype('float32')
-            output = exe.run(feed={"x": np_x}, fetch_list = [hidden1])
-            print(output)
+
+            x = paddle.static.data(name='x', shape=[8, 32, 32], dtype='float32')
+            output = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1)
+            print(output.shape)  # [8, 32, 32]
     """
     assert in_dygraph_mode(
     ) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!"
@@ -9736,7 +9729,7 @@ def prelu(x, mode, param_attr=None, name=None):
     if mode not in ['all', 'channel', 'element']:
         raise ValueError('mode should be one of all, channel, element.')
     alpha_shape = [1]
-    # NOTE(): The input of this API should be ``N,C,...`` format, 
+    # NOTE(): The input of this API should be ``N,C,...`` format,
     # which means x.shape[0] is batch_size and x.shape[1] is channel.
     if mode == 'channel':
         assert len(
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 516181d913..72302d81d6 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -225,8 +225,8 @@ class WeightNormParamAttr(ParamAttr):
     Note:
         ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0.
         Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
-        There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` ,
-        :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` .
+        There are three clipping strategies: :ref:`api_paddle_nn_ClipGradByGlobalNorm` ,
+        :ref:`api_paddle_nn_ClipGradByNorm` , :ref:`api_paddle_nn_ClipGradByValue` .
 
     Args:
@@ -244,8 +244,8 @@ class WeightNormParamAttr(ParamAttr):
             optimizer is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
             Default 1.0.
         regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are
-            two methods: :ref:`api_paddle_fluid_regularizer_L1Decay` ,
-            :ref:`api_paddle_fluid_regularizer_L2DecayRegularizer`.
+            two methods: :ref:`api_paddle_regularizer_L1Decay` ,
+            :ref:`api_paddle_regularizer_L2Decay`.
             If regularizer is also set in ``optimizer``
             (such as :ref:`api_paddle_optimizer_SGD` ), that regularizer setting in
             optimizer will be ignored. Default None, meaning there is no regularization.
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 48b4e4692f..b3bdf1e95c 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -1554,7 +1554,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
             paddle.disable_static()
             x = paddle.to_tensor(x_data, stop_gradient=False)
             output = paddle.nn.functional.label_smooth(x)
-            print(output.numpy())
+            print(output)
 
             #[[[0.03333334 0.93333334 0.03333334]
             #  [0.93333334 0.03333334 0.93333334]]]
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 0da00735b4..c0ca8350fa 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -643,7 +643,7 @@ class TransformerDecoderLayer(Layer):
             for linear in FFN. Otherwise, the three sub-layers all use it as
             `weight_attr` to create parameters. Default: None, which means the
             default weight parameter property is used. See usage for details
-            in :ref:`api_fluid_ParamAttr` .
+            in :ref:`api_paddle_fluid_param_attr_ParamAttr` .
         bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
             self attention, `bias_attr[1]` would be used as `bias_attr` for
@@ -1199,7 +1199,7 @@ class Transformer(Layer):
             transformer_paddle = Transformer(
                 d_model, n_head, dim_feedforward=dim_feedforward)
             mask = transformer_paddle.generate_square_subsequent_mask(length)
-            print(mask.numpy())
+            print(mask)
 
             # [[ 0. -inf -inf -inf -inf]
             #  [ 0.  0. -inf -inf -inf]
--
GitLab
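
The trimmed layer_norm example above no longer shows how to execute the graph. For readers migrating from the fluid executor boilerplate that the patch removes, here is a minimal sketch of the paddle.static equivalent; it is not part of the patch, and assumes Paddle 2.0's `paddle.static.Executor` and `paddle.static.default_startup_program`:

    import numpy as np
    import paddle

    paddle.enable_static()

    # Same graph as the updated docstring example.
    x = paddle.static.data(name='x', shape=[8, 32, 32], dtype='float32')
    out = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1)

    # Run once to confirm the normalized result keeps the input shape.
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())
    np_x = np.random.random(size=(8, 32, 32)).astype('float32')
    result, = exe.run(feed={'x': np_x}, fetch_list=[out])
    print(result.shape)  # (8, 32, 32)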
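
The reference renames in the WeightNormParamAttr docstring point at the 2.0 public API (`paddle.nn.ClipGradBy*`, `paddle.regularizer.L1Decay`/`L2Decay`). A sketch of how the attribute is typically wired into a static-graph layer; the layer size and parameter name here are illustrative, not taken from the patch:

    import paddle

    paddle.enable_static()

    data = paddle.static.data(name='data', shape=[None, 32], dtype='float32')
    fc = paddle.static.nn.fc(
        x=data,
        size=64,
        weight_attr=paddle.static.WeightNormParamAttr(
            dim=None,  # normalize over the whole weight tensor
            name='weight_norm_param',
            initializer=paddle.nn.initializer.Constant(1.0),
            learning_rate=1.0,
            regularizer=paddle.regularizer.L2Decay(0.1),  # one of the two fixed refs
            trainable=True,
            do_model_average=False,
            need_clip=True))  # replaces the deprecated gradient_clip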
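
The expected output in the label_smooth hunk follows directly from the documented smoothing rule: with no prior_dist, each 0/1 label y becomes (1 - epsilon) * y + epsilon / num_classes. A plain-numpy check, assuming the elided x_data is the 0/1 pattern implied by the printed result (epsilon defaults to 0.1, and the example has three classes):

    import numpy as np

    epsilon, num_classes = 0.1, 3
    x_data = np.array([[[0., 1., 0.],
                        [1., 0., 1.]]], dtype='float32')

    # Mix each label with a uniform distribution over the classes.
    smoothed = (1 - epsilon) * x_data + epsilon / num_classes
    print(smoothed)
    # [[[0.03333334 0.93333334 0.03333334]
    #   [0.93333334 0.03333334 0.93333334]]]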
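
The mask printed in the last hunk is the standard causal (look-ahead) mask: zeros on and below the diagonal, -inf above it, so softmax attention cannot attend to future positions. An equivalent construction in plain numpy for the length=5 case shown, independent of Paddle:

    import numpy as np

    length = 5
    i = np.arange(length)
    # -inf above the diagonal blocks attention to future positions;
    # 0 elsewhere leaves the pre-softmax scores unchanged.
    mask = np.where(i[None, :] > i[:, None], -np.inf, 0.).astype('float32')
    print(mask)
    # [[  0. -inf -inf -inf -inf]
    #  [  0.   0. -inf -inf -inf]
    #  ...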