From 4460c757a359dd6d22048f10a88200c6790a434d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=A6=E6=B8=A3=E6=88=8A?= Date: Fri, 16 Dec 2022 11:53:52 +0800 Subject: [PATCH] fix docstring (#49044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix docstring: 1. Removed the use of numpy from the example code of the BatchNorm class in python/paddle/fluid/dygraph/nn.py, and spelled out what each letter of "NCHW" means in the data_layout parameter description; 2. Spelled out what each letter of "NCDHW" means in the data_format parameter description of the batch_norm function in python/paddle/nn/functional/norm.py; 3. Spelled out what each letter of "NCDHW" means in the data_format parameter descriptions of the BatchNorm, BatchNorm1D, BatchNorm2D and BatchNorm3D classes in python/paddle/nn/layer/norm.py; 4. Fixed the formatting errors in the Shapes description of the BCELoss class in python/paddle/nn/layer/loss.py, as well as the formatting errors in the Shapes description and the broken Examples of the BCEWithLogitsLoss class; 5. Reworded the Returns descriptions of the binary_cross_entropy and binary_cross_entropy_with_logits functions in python/paddle/nn/functional/loss.py so that they read like the other function descriptions; 6. Moved the parameter descriptions from the `__init__` docstring of the BeamSearchDecoder class in python/paddle/nn/decode.py up into the class docstring. * Revised according to review comments. * Fixed the errors pointed out in the online documentation. --- python/paddle/fluid/dygraph/nn.py | 6 ++--- python/paddle/nn/decode.py | 13 +++++++++++ python/paddle/nn/functional/loss.py | 16 +++++++------- python/paddle/nn/functional/norm.py | 4 ++-- python/paddle/nn/layer/loss.py | 34 +++++++++++------------------ python/paddle/nn/layer/norm.py | 8 +++---- 6 files changed, 43 insertions(+), 38 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 698dd64abf..8a833eb7a0 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -119,7 +119,7 @@ class BatchNorm(layers.Layer): is not set, the bias is initialized zero. Default: None. dtype(str, optional): Indicate the data type of the input ``Tensor``, which can be float32 or float64. Default: float32. - data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW. in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. @@ -140,11 +140,11 @@ class BatchNorm(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable - import numpy as np - x = np.random.random(size=(3, 10, 3, 7)).astype('float32') + x = paddle.rand([3, 10, 3, 7], 'float32') with fluid.dygraph.guard(): x = to_variable(x) batch_norm = fluid.BatchNorm(10) diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index 804849ddc9..b5eedf767e 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -153,6 +153,19 @@ class BeamSearchDecoder(Decoder): :code:`BeamSearchDecoder.tile_beam_merge_with_batch` . The most common case for this is the encoder output in attention mechanism. + Parameters: + cell (RNNCellBase): An instance of `RNNCellBase` or object with the same interface. + start_token (int): The start token id. + end_token (int): The end token id. + beam_size (int): The beam width used in beam search. + embedding_fn (optional): A callable to apply to selected candidate ids.
+ Mostly it is an embedding layer to transform ids to embeddings, + and the returned value acts as the `input` argument for `cell.call`. + If not provided, the id to embedding transformation must be built into + `cell.call`. Default None. + output_fn (optional): A callable to apply to the cell's output prior to + calculate scores and select candidate token ids. Default None. + Returns: BeamSearchDecoder: An instance of decoder which can be used in \ `paddle.nn.dynamic_decode` to implement decoding. diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f894c7bfcb..77fd38b44d 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -592,7 +592,7 @@ def binary_cross_entropy( input, label, weight=None, reduction='mean', name=None ): """ - This op measures the binary_cross_entropy loss between input predictions ``input`` + Measure the binary_cross_entropy loss between input predictions ``input`` and target labels ``label`` . The binary_cross_entropy loss can be described as: If :attr:`weight` is set, the loss is: @@ -641,7 +641,7 @@ def binary_cross_entropy( Returns: - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + Tensor. If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is scalar. Examples: @@ -728,7 +728,7 @@ def binary_cross_entropy_with_logits( logit, label, weight=None, reduction='mean', pos_weight=None, name=None ): r""" - This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + Combine the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. This measures the element-wise probability error in classification tasks in which each class is independent. @@ -736,7 +736,7 @@ def binary_cross_entropy_with_logits( are not mutually exclusive. For example, a news article can be about politics, technology or sports at the same time or none of these. - First this operator calculate loss function as follows: + Firstly, calculate loss function as follows: .. math:: Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) @@ -752,13 +752,13 @@ def binary_cross_entropy_with_logits( .. math:: Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) - Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the + Then, if ``weight`` or ``pos_weight`` is not None, then multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different weight on every items in the batch. The ``pos_weight`` will attach different weight on the positive label of each class. - Finally, this operator applies reduce operation on the loss. - If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. + Finally, apply reduce operation on the loss. + If :attr:`reduction` set to ``'none'``, will return the original loss `Out`. If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`. If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`. @@ -787,7 +787,7 @@ def binary_cross_entropy_with_logits( For more information, please refer to :ref:`api_guide_Name`. Returns: - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + Tensor. If ``reduction`` is ``'none'``, the shape of output is same as ``logit`` , else the shape of output is scalar. 
Examples: diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 8c6d65a15e..125dde4289 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -151,9 +151,9 @@ def batch_norm( weight(Tensor): The weight tensor of batch_norm, can not be None. bias(Tensor): The bias tensor of batch_norm can not be None. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. - momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Default False. - data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default "NCHW". + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC", where `N` is batch size, `C` is the number of the feature map, `D` is the depth of the feature, `H` is the height of the feature map, `W` is the width of the feature map, `L` is the length of the feature map. Default "NCHW". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index b7408d93dd..d2248e34db 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -29,7 +29,7 @@ __all__ = [] class BCEWithLogitsLoss(Layer): r""" - This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer. + Combine the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer. This measures the element-wise probability error in classification tasks in which each class is independent. @@ -37,7 +37,7 @@ class BCEWithLogitsLoss(Layer): are not mutually exclusive. For example, a news article can be about politics, technology or sports at the same time or none of these. - First this operator calculate loss function as follows: + Firstly, calculate loss function as follows: .. math:: Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) @@ -53,13 +53,13 @@ class BCEWithLogitsLoss(Layer): .. math:: Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) - Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the + Then, if ``weight`` or ``pos_weight`` is not None, then multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different weight on every items in the batch. The ``pos_weight`` will attach different weight on the positive label of each class. - Finally, this operator applies reduce operation on the loss. - If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. + Finally, apply reduce operation on the loss. + If :attr:`reduction` set to ``'none'``, will return the original loss `Out`. 
If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`. If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`. @@ -82,22 +82,19 @@ class BCEWithLogitsLoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shapes: - - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, `*`], - N is batch_size, `*` means number of additional dimensions. The ``logit`` - is usually the output of Linear layer. Available dtype is float32, float64. - - label (Tensor): The target labels tensor. 2-D tensor with the same shape as - ``logit``. The target labels which values should be numbers between 0 and 1. - Available dtype is float32, float64. - - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``logit`` , else the shape of output is scalar. + - logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`], N is batch_size, `*` means number of additional dimensions. The ``logit`` is usually the output of Linear layer. Available dtype is float32, float64. + - label (Tensor): The target labels tensor. 2-D tensor with the same shape as ``logit``. The target labels whose values should be numbers between 0 and 1. Available dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``logit`` , else the shape of output is scalar. Returns: A callable object of BCEWithLogitsLoss. Examples: + .. code-block:: python import paddle + logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") bce_logit_loss = paddle.nn.BCEWithLogitsLoss() @@ -722,14 +719,9 @@ class BCELoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means - number of additional dimensions. The input ``input`` should always - be the output of sigmod. Available dtype is float32, float64. - - label (Tensor): 2-D tensor with the same shape as ``input``. The target - labels which values should be numbers between 0 and 1. Available - dtype is float32, float64. - - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is scalar. + - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means number of additional dimensions. The input ``input`` should always be the output of sigmoid. Available dtype is float32, float64. + - label (Tensor): 2-D tensor with the same shape as ``input``. The target labels whose values should be numbers between 0 and 1. Available dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is scalar. Returns: A callable object of BCELoss. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 82c4972494..b999f9ca3a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -881,7 +881,7 @@ class BatchNorm(Layer): is not set, the bias is initialized zero. Default: None. dtype(str, optional): Indicate the data type of the input ``Tensor``, which can be float32 or float64. Default: float32. - data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
+ data_layout(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW. in_place(bool, optional): Make the input and output of batch norm reuse memory. Default: False. moving_mean_name(str, optional): The name of moving_mean which store the global Mean. Default: None. moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. Default: None. @@ -1169,7 +1169,7 @@ class BatchNorm1D(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL". + data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC", where `N` is batch size, `C` is the number of the feature map, `L` is the length of the feature map. Default "NCL". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -1282,7 +1282,7 @@ class BatchNorm2D(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC", where `N` is batch size, `C` is the number of the feature map, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCHW. use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -1368,7 +1368,7 @@ class BatchNorm3D(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to False, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW. + data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC", where `N` is batch size, `C` is the number of the feature map, `D` is the depth of the feature, `H` is the height of the feature map, `W` is the width of the feature map. Default: NCDHW. use_global_stats(bool|None, optional): Whether to use global mean and variance. 
If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. -- GitLab
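Note (outside the patch): the following is a minimal sketch of the layout semantics that the clarified `data_format` / `data_layout` wording describes. It assumes the Paddle 2.x API already referenced by the touched docstrings (`paddle.rand`, `paddle.nn.BatchNorm2D`) and is illustrative only, not part of the diff.

.. code-block:: python

    import paddle

    # Input arranged as NCHW: batch size N=3, C=10 feature-map channels,
    # height H=3, width W=7 -- the layout named by data_format="NCHW".
    x = paddle.rand([3, 10, 3, 7], dtype='float32')

    # num_features must match C (the channel dimension) of the NCHW input.
    batch_norm = paddle.nn.BatchNorm2D(10, data_format='NCHW')
    y = batch_norm(x)
    print(y.shape)  # [3, 10, 3, 7] -- batch norm preserves the input shape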