Commit 2c2fe9be authored by mindspore-ci-bot, committed by Gitee

!4121 Third round of the enhancement of API comments

Merge pull request !4121 from Simson/enhancement-API
......@@ -33,7 +33,7 @@ from ..common.tensor import Tensor
class Cell:
"""
Base class for all neural network.
Base class for all neural networks.
A 'Cell' could be a single neural network cell, such as conv2d, relu, batch_norm, etc., or a composition of
cells that constructs a network.
......@@ -42,8 +42,8 @@ class Cell:
In general, the autograd algorithm will automatically generate the implementation of the gradient function,
but if bprop method is implemented, the gradient function
will be replaced by the bprop. The bprop implementation will receive a Tensor `dout` containing the gradient
of the loss w.r.t. the output, and a Tensor `out` containing the forward result. The bprop need to compute the
gradient of the loss w.r.t. the inputs, gradient of the loss w.r.t. Parameter variables is not supported
of the loss w.r.t. the output, and a Tensor `out` containing the forward result. The bprop needs to compute the
gradient of the loss w.r.t. the inputs; the gradient of the loss w.r.t. Parameter variables is not supported
currently.
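For illustration, a minimal sketch of a custom bprop (assuming `import mindspore.nn as nn`; the cell and its gradient are made up for this note, not part of the patch):
>>> class MulAdd(nn.Cell):
>>>     def construct(self, x, y):
>>>         return 2 * x + y
>>>     def bprop(self, x, y, out, dout):
>>>         # one gradient per input: df/dx = 2, df/dy = 1
>>>         return 2 * dout, dout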
Args:
......@@ -138,7 +138,7 @@ class Cell:
"""
Update all the child cells' self.param_prefix.
After invoked, can get all the cell's children's name prefix by '_param_prefix'.
After being invoked, the name prefix of each child cell can be obtained through '_param_prefix'.
"""
cells_name = self.cells_and_names()
......@@ -147,9 +147,9 @@ class Cell:
def update_cell_type(self, cell_type):
"""
Update current cell type mainly identify if quantization aware training network.
Update the current cell type, mainly to identify whether this is a quantization aware training network.
After invoked, can set the cell type to 'cell_type'.
After being invoked, the cell type is set to 'cell_type'.
"""
self.cell_type = cell_type
......@@ -346,7 +346,7 @@ class Cell:
Please refer to the usage in source code of `mindspore.common._Executor.compile`.
Args:
params (dict): The parameters dictionary used for init data graph.
params (dict): The parameters dictionary used for initializing the data graph.
"""
if params is None:
params = self.parameters_dict()
......@@ -499,7 +499,7 @@ class Cell:
"""
Adds a child cell to the current cell.
Inserts a subcell with given name to current cell.
Inserts a subcell with a given name to the current cell.
Args:
child_name (str): Name of the child cell.
......@@ -534,7 +534,7 @@ class Cell:
def init_parameters_data(self, auto_parallel_mode=False):
"""
Init all parameters' data and replace the original saved parameters in cell.
Initialize all parameters and replace the original saved parameters in the cell.
Notes:
trainable_params() and other similar interfaces may return different parameter instance after
......@@ -655,7 +655,7 @@ class Cell:
Yields parameters of this cell. If `expand` is True, yield parameters of this cell and all subcells.
Args:
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only parameters
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only the parameters
that are direct members of this cell. Default: True.
Examples:
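A hedged sketch of such iteration (assuming `Net` is a user-defined Cell):
>>> net = Net()
>>> for item in net.get_parameters():
>>>     print(item)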
......@@ -682,7 +682,7 @@ class Cell:
Args:
name_prefix (str): Namespace. Default: ''.
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only parameters
expand (bool): If True, yields parameters of this cell and all subcells. Otherwise, yields only the parameters
that are direct members of this cell. Default: True.
Examples:
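A hedged sketch (again assuming a user-defined `Net`):
>>> n = Net()
>>> names = []
>>> for m in n.parameters_and_names():
>>>     if m[0]:
>>>         names.append(m[0])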
......@@ -772,7 +772,7 @@ class Cell:
return self._scope
def generate_scope(self):
"""Generate the scope for every cell object in the network."""
"""Generate the scope for each cell object in the network."""
for name, cell in self._children_scope_recursive():
cell._set_scope(name)
......@@ -819,14 +819,14 @@ class Cell:
`mindspore.train.amp.build_train_network`.
Note:
Call multiple times will overwrite the previous.
Multiple calls will overwrite the previous setting.
Args:
dst_type (:class:`mindspore.dtype`): Transfer the Cell to run with dst_type.
dst_type can be `mindspore.dtype.float16` or `mindspore.dtype.float32`.
Raises:
ValueError: If dst_type is not float32 or float16.
ValueError: If dst_type is neither float32 nor float16.
"""
if dst_type not in (mstype.float16, mstype.float32):
raise ValueError("dst_type should inside float32 or float16.")
......@@ -871,8 +871,8 @@ class Cell:
Set the cell to auto parallel mode.
Note:
If a cell needs to use auto parallel or semi auto parallel mode for training, evaluation or prediction,
this interface needs to be called for the cell.
If a cell needs to use the auto parallel or semi auto parallel mode for training, evaluation or prediction,
this interface needs to be called on the cell.
"""
self._auto_parallel_mode = True
self.add_flags(auto_parallel=True)
......@@ -890,9 +890,9 @@ class Cell:
Set the cell backward hook function. Note that this function is only supported in Pynative Mode.
Note:
fn should be defined as following code shows, `cell_name` is the name of registered cell,
`grad_input` is gradient passed to the cell, `grad_output` is the gradient computed and pass to
next cell or primitve, which may be modified and return.
fn should be defined as the following code shows. `cell_name` is the name of the registered cell.
`grad_input` is the gradient passed to the cell. `grad_output` is the gradient computed and passed to the
next cell or primitive, which may be modified and returned.
>>> hook_fn(cell_name, grad_input, grad_output) -> Tensor or None
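A hedged sketch of such a hook in PyNative mode (the inspection is illustrative; returning None leaves the gradient unchanged):
>>> def hook_fn(cell_name, grad_input, grad_output):
>>>     print(cell_name, grad_input)
>>>     return None
>>> net = nn.ReLU()
>>> net.register_backward_hook(hook_fn)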
Args:
......@@ -907,7 +907,7 @@ class Cell:
Set whether the trainable parameter is updated by parameter server.
Note:
This only works when running task in parameter server mode.
It only works when the running task is in parameter server mode.
Args:
recurse (bool): Whether to set the trainable parameters of subcells. Default: True.
......
......@@ -172,7 +172,7 @@ class Dense(Cell):
bias_init (Union[Tensor, str, Initializer, numbers.Number]): The trainable bias_init parameter. The dtype is
the same as that of input x. The values of str refer to the function `initializer`. Default: 'zeros'.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
activation (str): activate function applied to the output of the fully connected layer, eg. 'relu'.
activation (str): The activation function applied to the output of the fully connected layer, e.g. 'ReLU'.
Default: None.
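A hedged usage sketch (assuming the usual `numpy as np`, `Tensor` and `mindspore` imports; values are made up):
>>> net = nn.Dense(3, 4)
>>> input = Tensor(np.random.randint(0, 255, [2, 3]), mindspore.float32)
>>> output = net(input)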
Raises:
......
......@@ -236,7 +236,7 @@ class CellList(_CellListBase, Cell):
Appends cells from a Python iterable to the end of the list.
Raises:
TypeError: If the cells is not a list of subcells.
TypeError: If the cells are not a list of subcells.
"""
if not isinstance(cells, list):
raise TypeError('Cells {} should be a list of subcells'.format(cells))
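A hedged sketch of extend (the cells are illustrative):
>>> cell_ls = nn.CellList([nn.ReLU()])
>>> cell_ls.extend([nn.Dense(3, 4), nn.Sigmoid()])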
......
......@@ -111,11 +111,11 @@ class Conv2d(_Conv):
out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,
where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
:math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
:math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
:math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
to split the input in the channel dimension.
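As a hedged usage sketch with the default pad_mode "same" (shapes are illustrative; "same" keeps the spatial size):
>>> net = nn.Conv2d(120, 240, 4, has_bias=False, weight_init='normal')
>>> input = Tensor(np.ones([1, 120, 1024, 640]), mindspore.float32)
>>> net(input).shape
(1, 240, 1024, 640)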
......@@ -132,7 +132,7 @@ class Conv2d(_Conv):
in_channels (int): The number of input channel :math:`C_{in}`.
out_channels (int): The number of output channel :math:`C_{out}`.
kernel_size (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the height
and width of the 2D convolution window. Single int means the value if for both height and width of
and width of the 2D convolution window. Single int means the value is for both the height and the width of
the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
width of the kernel.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
......@@ -141,13 +141,13 @@ class Conv2d(_Conv):
pad_mode (str): Specifies padding mode. The optional values are
"same", "valid", "pad". Default: "same".
- same: Adopts the way of completion. Output height and width will be the same as the input.
Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible. Otherwise, the
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible. Otherwise, the
last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
must be 0.
- valid: Adopts the way of discarding. The possibly largest height and width of output will be return
- valid: Adopts the way of discarding. The largest possible height and width of the output will be returned
without padding. Extra pixels will be discarded. If this mode is set, `padding`
must be 0.
......@@ -155,9 +155,9 @@ class Conv2d(_Conv):
Tensor borders. `padding` should be greater than or equal to 0.
padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
the padding of top, bottom, left and right is same, equal to padding. If `padding` is tuple with
four integer, the padding of top, bottom, left and right equal to padding[0], padding[1],
padding[2], padding[3] with corresponding. Default: 0.
the padding of top, bottom, left and right is the same, equal to padding. If `padding` is a tuple
with four integers, the padding of top, bottom, left and right will be equal to padding[0],
padding[1], padding[2], and padding[3] respectively. Default: 0.
dilation (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
......@@ -167,7 +167,7 @@ class Conv2d(_Conv):
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
It can be a Tensor, a string, an Initializer or a number. When a string is specified,
values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
......@@ -274,10 +274,10 @@ class Conv1d(_Conv):
out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,
where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
of kernel and it has shape :math:`(\text{ks_w})`, where :math:`\text{ks_w}` are width of the convolution kernel.
of kernel and it has shape :math:`(\text{ks_w})`, where :math:`\text{ks_w}` is the width of the convolution kernel.
The full kernel has shape :math:`(C_{out}, C_{in} // \text{group}, \text{ks_w})`, where group is the group number
to split the input in the channel dimension.
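A hedged 1D counterpart of the sketch above (shapes illustrative):
>>> net = nn.Conv1d(120, 240, 4, has_bias=False, weight_init='normal')
>>> input = Tensor(np.ones([1, 120, 640]), mindspore.float32)
>>> net(input).shape
(1, 240, 640)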
......@@ -285,8 +285,8 @@ class Conv1d(_Conv):
:math:`\left \lfloor{1 + \frac{W_{in} + 2 \times \text{padding} - \text{ks_w} -
(\text{ks_w} - 1) \times (\text{dilation} - 1) }{\text{stride}}} \right \rfloor` respectively.
The first introduction can be found in paper `Gradient Based Learning Applied to Document Recognition
<http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
The first introduction of the convolution layer can be found in the paper `Gradient Based Learning Applied to
Document Recognition <http://vision.stanford.edu/cs598_spring07/papers/Lecun98.pdf>`_.
Args:
in_channels (int): The number of input channel :math:`C_{in}`.
......@@ -298,13 +298,13 @@ class Conv1d(_Conv):
pad_mode (str): Specifies padding mode. The optional values are
"same", "valid", "pad". Default: "same".
- same: Adopts the way of completion. Output width will be the same as the input.
Total number of padding will be calculated for horizontal
- same: Adopts the way of completion. The output width will be the same as the input.
The total amount of padding will be calculated in the horizontal
direction and evenly distributed to left and right if possible. Otherwise, the
last extra padding will be done from the right side. If this mode is set, `padding`
must be 0.
- valid: Adopts the way of discarding. The possibly largest width of output will be return
- valid: Adopts the way of discarding. The largest possible width of the output will be returned
without padding. Extra pixels will be discarded. If this mode is set, `padding`
must be 0.
......@@ -320,8 +320,8 @@ class Conv1d(_Conv):
group (int): Split filter into groups, `in_channels` and `out_channels` should be
divisible by the number of groups. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
weight_init (Union[Tensor, str, Initializer, numbers.Number]): An initializer for the convolution kernel.
It can be a Tensor, a string, an Initializer or a number. When a string is specified,
values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
......@@ -443,8 +443,8 @@ class Conv2dTranspose(_Conv):
r"""
2D transposed convolution layer.
Compute a 2D transposed convolution, which is also know as a deconvolution
(although it is not actual deconvolution).
Compute a 2D transposed convolution, which is also known as a deconvolution
(although it is not an actual deconvolution).
Input is typically of shape :math:`(N, C, H, W)`, where :math:`N` is batch size and :math:`C` is channel number.
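A hedged usage sketch (shapes illustrative):
>>> net = nn.Conv2dTranspose(3, 64, 4, has_bias=False, weight_init='normal')
>>> input = Tensor(np.ones([1, 3, 16, 50]), mindspore.float32)
>>> output = net(input)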
......@@ -452,7 +452,7 @@ class Conv2dTranspose(_Conv):
in_channels (int): The number of channels in the input space.
out_channels (int): The number of channels in the output space.
kernel_size (Union[int, tuple]): int or tuple with 2 integers, which specifies the height
and width of the 2D convolution window. Single int means the value is for both height and width of
and width of the 2D convolution window. Single int means the value is for both the height and the width of
the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
width of the kernel.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
......@@ -467,19 +467,19 @@ class Conv2dTranspose(_Conv):
- valid: Adopts the way of discarding.
padding (Union[int, tuple[int]]): Implicit paddings on both sides of the input. If `padding` is one integer,
the padding of top, bottom, left and right is same, equal to padding. If `padding` is tuple with
four integer, the padding of top, bottom, left and right equal to padding[0], padding[1],
padding[2], padding[3] with corresponding. Default: 0.
dilation (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the dilation rate
the padding of top, bottom, left and right is the same, equal to padding. If `padding` is a tuple
with four integers, the padding of top, bottom, left and right will be equal to padding[0],
padding[1], padding[2], and padding[3] respectively. Default: 0.
dilation (Union[int, tuple[int]]): The data type is int or a tuple of 2 integers. Specifies the dilation rate
to use for dilated convolution. If set to be :math:`k > 1`, there will
be :math:`k - 1` pixels skipped for each sampling location. Its value should
be greater or equal to 1 and bounded by the height and width of the
be greater than or equal to 1 and bounded by the height and width of the
input. Default: 1.
group (int): Split filter into groups, `in_channels` and `out_channels` should be
divisible by the number of groups. This is not support for Davinci devices when group > 1. Default: 1.
divisible by the number of groups. This is not supported on Davinci devices when group > 1. Default: 1.
has_bias (bool): Specifies whether the layer uses a bias vector. Default: False.
weight_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the convolution kernel.
It can be a Tensor, a string, an Initializer or a numbers.Number. When a string is specified,
It can be a Tensor, a string, an Initializer or a number. When a string is specified,
values from 'TruncatedNormal', 'Normal', 'Uniform', 'HeUniform' and 'XavierUniform' distributions as well
as constant 'One' and 'Zero' distributions are possible. Alias 'xavier_uniform', 'he_uniform', 'ones'
and 'zeros' are acceptable. Uppercase and lowercase are both acceptable. Refer to the values of
......@@ -614,8 +614,8 @@ class Conv1dTranspose(_Conv):
r"""
1D transposed convolution layer.
Compute a 1D transposed convolution, which is also know as a deconvolution
(although it is not actual deconvolution).
Compute a 1D transposed convolution, which is also known as a deconvolution
(although it is not an actual deconvolution).
Input is typically of shape :math:`(N, C, W)`, where :math:`N` is batch size and :math:`C` is channel number.
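A hedged usage sketch (shapes illustrative):
>>> net = nn.Conv1dTranspose(3, 64, 4, has_bias=False, weight_init='normal')
>>> input = Tensor(np.ones([1, 3, 50]), mindspore.float32)
>>> output = net(input)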
......@@ -805,11 +805,11 @@ class DepthwiseConv2d(Cell):
out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,
where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
:math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
:math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
:math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
to split the input in the channel dimension.
......@@ -826,7 +826,7 @@ class DepthwiseConv2d(Cell):
in_channels (int): The number of input channel :math:`C_{in}`.
out_channels (int): The number of output channel :math:`C_{out}`.
kernel_size (Union[int, tuple[int]]): The data type is int or tuple with 2 integers. Specifies the height
and width of the 2D convolution window. Single int means the value if for both height and width of
and width of the 2D convolution window. Single int means the value is for both the height and the width of
the kernel. A tuple of 2 ints means the first value is for the height and the other is for the
width of the kernel.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
......@@ -835,13 +835,13 @@ class DepthwiseConv2d(Cell):
pad_mode (str): Specifies padding mode. The optional values are
"same", "valid", "pad". Default: "same".
- same: Adopts the way of completion. Output height and width will be the same as the input.
Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible. Otherwise, the
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible. Otherwise, the
last extra padding will be done from the bottom and the right side. If this mode is set, `padding`
must be 0.
- valid: Adopts the way of discarding. The possibly largest height and width of output will be return
- valid: Adopts the way of discarding. The largest possible height and width of the output will be returned
without padding. Extra pixels will be discarded. If this mode is set, `padding`
must be 0.
......
......@@ -248,7 +248,7 @@ class BatchNorm1d(_BatchNorm):
eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
momentum (float): A floating hyperparameter of the momentum for the
running_mean and running_var computation. Default: 0.9.
affine (bool): A bool value when set to True, gamma and beta can be learnable. Default: True.
affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.
......@@ -262,9 +262,9 @@ class BatchNorm1d(_BatchNorm):
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.
use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false,
use the mean value and variance value of specified value. If None, training process will use the mean and
variance of current batch data and track the running mean and variance, eval process will use the running
mean and variance. Default: None.
use the specified values of mean and variance. If None, the training process will use the mean and
variance of the current batch data and track the running mean and variance; the evaluation process will use
the running mean and variance. Default: None.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in})`.
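A hedged usage sketch (values illustrative):
>>> net = nn.BatchNorm1d(num_features=4)
>>> input = Tensor(np.random.randint(0, 255, [2, 4]), mindspore.float32)
>>> output = net(input)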
......@@ -324,7 +324,7 @@ class BatchNorm2d(_BatchNorm):
eps (float): A value added to the denominator for numerical stability. Default: 1e-5.
momentum (float): A floating hyperparameter of the momentum for the
running_mean and running_var computation. Default: 0.9.
affine (bool): A bool value when set to True, gamma and beta can be learnable. Default: True.
affine (bool): A bool value. When set to True, gamma and beta can be learned. Default: True.
gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.
......@@ -338,9 +338,9 @@ class BatchNorm2d(_BatchNorm):
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.
use_batch_statistics (bool): If true, use the mean value and variance value of current batch data. If false,
use the mean value and variance value of specified value. If None, training process will use the mean and
variance of current batch data and track the running mean and variance, eval process will use the running
mean and variance. Default: None.
use the specified values of mean and variance. If None, the training process will use the mean and
variance of the current batch data and track the running mean and variance; the evaluation process will use
the running mean and variance. Default: None.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
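A hedged usage sketch (values illustrative):
>>> net = nn.BatchNorm2d(num_features=3)
>>> input = Tensor(np.random.randint(0, 255, [1, 3, 2, 2]), mindspore.float32)
>>> output = net(input)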
......
......@@ -84,16 +84,16 @@ class MaxPool2d(_PoolNd):
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
pad_mode (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
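A hedged sketch with pad_mode "valid" (output height and width are (4 - 3)/1 + 1 = 2):
>>> pool = nn.MaxPool2d(kernel_size=3, stride=1)
>>> x = Tensor(np.random.randint(0, 10, [1, 2, 4, 4]), mindspore.float32)
>>> pool(x).shape
(1, 2, 2, 2)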
......@@ -158,23 +158,23 @@ class AvgPool2d(_PoolNd):
pad_mode for training only supports "same" and "valid".
Args:
kernel_size (Union[int, tuple[int]]): The size of kernel used to take the average value,
is an int number that represents height and width are both kernel_size,
kernel_size (Union[int, tuple[int]]): The size of the kernel used to take the average value.
It can be an int whose value represents both the height and the width of the kernel,
or a tuple of two int numbers that represent the height and width respectively.
Default: 1.
stride (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
pad_mode (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
......@@ -238,16 +238,16 @@ class AvgPool1d(_PoolNd):
kernel_size (int): The size of the kernel window used to take the average value. Default: 1.
stride (int): The distance of kernel moving, an int number that represents
the width of movement. Default: 1.
pad_mode (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
pad_mode (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The possible largest height and width of output
will be returned without padding. Extra pixels will be discarded.
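A hedged usage sketch (a length-6 window over a length-6 input yields one output step):
>>> pool = nn.AvgPool1d(kernel_size=6, stride=1)
>>> x = Tensor(np.random.randint(0, 10, [1, 3, 6]), mindspore.float32)
>>> pool(x).shape
(1, 3, 1)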
Inputs:
......
This diff has been collapsed.
......@@ -353,17 +353,18 @@ class CosineEmbeddingLoss(_Loss):
Args:
margin (float): Should be in [-1.0, 1.0]. Default: 0.0.
reduction (str): Specifies which reduction to apply to the output. It should be one of
"none", "mean", "sum", meaning no reduction, reduce mean or sum on output, respectively. Default "mean".
reduction (str): Specifies the reduction to be applied to the output. It should be one of
"none", "mean", and "sum", meaning no reduction, mean reduction, and summation on the output, respectively. Default: "mean".
Inputs:
- **input_x1** (Tensor) - Input tensor.
- **input_x2** (Tensor) - Its shape and data type should be the same as `input_x1`'s shape and data type.
- **y** (Tensor) - Contains value 1 or -1. Suppose `input_x1` shape is
:math:`(x_1, x_2, x_3,..., x_R)`, then `target` shape should be :math:`(x_1, x_3, x_4, ..., x_R)`.
- **y** (Tensor) - Contains value 1 or -1. Suppose the shape of `input_x1` is
:math:`(x_1, x_2, x_3,..., x_R)`, then the shape of `target` should be :math:`(x_1, x_3, x_4, ..., x_R)`.
Outputs:
- **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, loss value otherwise.
- **loss** (Tensor) - If `reduction` is "none", its shape is the same as `y`'s shape, otherwise a scalar value
will be returned.
Examples:
>>> x1 = Tensor(np.array([[0.3, 0.8], [0.4, 0.3]]), mindspore.float32)
......
......@@ -21,9 +21,9 @@ class Accuracy(EvaluationBase):
r"""
Calculates the accuracy for classification and multilabel data.
The accuracy class creates two local variables, correct number and total number that are used to compute the
The accuracy class creates two local variables, the correct number and the total number that are used to compute the
frequency with which predictions match labels. This frequency is ultimately returned as the accuracy: an
idempotent operation that simply divides correct number by total number.
idempotent operation that simply divides the correct number by the total number.
.. math::
\text{accuracy} =\frac{\text{true_positive} + \text{true_negative}}
......@@ -58,17 +58,17 @@ class Accuracy(EvaluationBase):
Args:
inputs: Input `y_pred` and `y`. `y_pred` and `y` are each a `Tensor`, a list or an array.
For 'classification' evaluation type, `y_pred` is in most cases (not strictly) a list
For the 'classification' evaluation type, `y_pred` is in most cases (not strictly) a list
of floating numbers in range :math:`[0, 1]`
and the shape is :math:`(N, C)`, where :math:`N` is the number of cases and :math:`C`
is the number of categories. Shape of `y` can be :math:`(N, C)` with values 0 and 1 if one-hot
encoding is used or the shape is :math:`(N,)` with integer values if index of category is used.
For 'multilabel' evaluation type, `y_pred` and `y` can only be one-hot encoding with
values 0 or 1. Indices with 1 indicate positive category. The shape of `y_pred` and `y`
values 0 or 1. Indices with 1 indicate the positive category. The shape of `y_pred` and `y`
are both :math:`(N, C)`.
Raises:
ValueError: If the number of the input is not 2.
ValueError: If the number of inputs is not 2.
"""
if len(inputs) != 2:
raise ValueError('Accuracy needs 2 inputs (y_pred, y), but got {}'.format(len(inputs)))
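A hedged sketch of the metric loop (values illustrative; the third prediction is wrong, so accuracy is 2/3):
>>> x = Tensor(np.array([[0.2, 0.5], [0.3, 0.1], [0.9, 0.6]]), mindspore.float32)
>>> y = Tensor(np.array([1, 0, 1]), mindspore.float32)
>>> metric = nn.Accuracy('classification')
>>> metric.clear()
>>> metric.update(x, y)
>>> accuracy = metric.eval()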
......
......@@ -132,7 +132,7 @@ def _check_param_value(beta1, beta2, eps, prim_name):
class Adam(Optimizer):
r"""
Updates gradients by Adaptive Moment Estimation (Adam) algorithm.
Updates gradients by the Adaptive Moment Estimation (Adam) algorithm.
The Adam algorithm is proposed in `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_.
......@@ -157,9 +157,9 @@ class Adam(Optimizer):
weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.
To improve parameter groups performance, the customized order of parameters can be supported.
To improve the performance of parameter groups, a customized order of parameters is supported.
The sparse strategy is applied while the SparseGatherV2 operator being used for forward network.
The sparse strategy is applied while the SparseGatherV2 operator is used for the forward network.
The sparse feature is under continuous development. The sparse
behavior is currently performed on the CPU.
......@@ -170,36 +170,36 @@ class Adam(Optimizer):
- params: Required. The value should be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
- lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
- weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' should be in one of group parameters.
- order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
which in the 'order_params' should be in one of group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or graph for the learning rate.
When the learning_rate is a Iterable or a Tensor with dimension of 1, use dynamic learning rate, then
When the learning_rate is an Iterable or a Tensor with a dimension of 1, the dynamic learning rate is used; then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor with
dimension of 0, use fixed learning rate. Other cases are not supported. The float learning rate should be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 1e-3.
beta1 (float): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). Default:
0.9.
beta2 (float): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). Default:
0.999.
beta1 (float): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
Default: 0.9.
beta2 (float): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
Default: 0.999.
eps (float): Term added to the denominator to improve numerical stability. Should be greater than 0. Default:
1e-8.
use_locking (bool): Whether to enable a lock to protect updating variable tensors.
If True, updating of the var, m, and v tensors will be protected by a lock.
If False, the result is unpredictable. Default: False.
use_nesterov (bool): Whether to use the Nesterov Accelerated Gradient (NAG) algorithm to update the gradients.
If True, updates the gradients using NAG.
If False, updates the gradients without using NAG. Default: False.
If True, update the gradients using NAG.
If False, update the gradients without using NAG. Default: False.
weight_decay (float): Weight decay (L2 penalty). It should be equal to or greater than 0. Default: 0.0.
loss_scale (float): A floating point value for the loss scale. Should be greater than 0. Default: 1.0.
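A hedged usage sketch (assuming `Net`, a loss function `loss_fn`, and `Model` are defined elsewhere):
>>> net = Net()
>>> optim = nn.Adam(params=net.trainable_params(), learning_rate=0.1)
>>> model = Model(net, loss_fn=loss_fn, optimizer=optim)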
......@@ -278,11 +278,11 @@ class Adam(Optimizer):
class AdamWeightDecay(Optimizer):
"""
Implements Adam algorithm weight decay fix.
Implements the Adam algorithm with the weight decay fix.
Note:
When separating parameter groups, the weight decay in each group will be applied on the parameters if the
weight decay is posigive. When not separating parameter groups, the `weight_decay` in the API will be applied
weight decay is positive. When not separating parameter groups, the `weight_decay` in the API will be applied
on the parameters without 'beta' or 'gamma' in their names if `weight_decay` is positive.
To improve the performance of parameter groups, a customized order of parameters can be supported.
......@@ -294,27 +294,27 @@ class AdamWeightDecay(Optimizer):
- params: Required. The value should be a list of `Parameter`.
- lr: Optional. If "lr" in the keys, the value of corresponding learning rate will be used.
- lr: Optional. If "lr" is in the keys, the value of the corresponding learning rate will be used.
If not, the `learning_rate` in the API will be used.
- weight_decay: Optional. If "weight_decay" in the keys, the value of corresponding weight decay
- weight_decay: Optional. If "weight_decay" is in the keys, the value of the corresponding weight decay
will be used. If not, the `weight_decay` in the API will be used.
- order_params: Optional. If "order_params" in the keys, the value should be the order of parameters and
the order will be followed in optimizer. There are no other keys in the `dict` and the parameters which
in the value of 'order_params' should be in one of group parameters.
- order_params: Optional. If "order_params" is in the keys, the value should be the order of parameters and
the order will be followed in the optimizer. There are no other keys in the `dict` and the parameters
which in the 'order_params' should be in one of group parameters.
learning_rate (Union[float, Tensor, Iterable, LearningRateSchedule]): A value or graph for the learning rate.
When the learning_rate is a Iterable or a Tensor with dimension of 1, use dynamic learning rate, then
When the learning_rate is an Iterable or a Tensor with a dimension of 1, the dynamic learning rate is used; then
the i-th step will take the i-th value as the learning rate. When the learning_rate is LearningRateSchedule,
use dynamic learning rate, the i-th learning rate will be calculated during the process of training
according to the formula of LearningRateSchedule. When the learning_rate is a float or a Tensor with
dimension of 0, use fixed learning rate. Other cases are not supported. The float learning rate should be
equal to or greater than 0. If the type of `learning_rate` is int, it will be converted to float.
Default: 1e-3.
beta1 (float): The exponential decay rate for the 1st moment estimates. Default: 0.9.
beta1 (float): The exponential decay rate for the 1st moment estimations. Default: 0.9.
Should be in range (0.0, 1.0).
beta2 (float): The exponential decay rate for the 2nd moment estimates. Default: 0.999.
beta2 (float): The exponential decay rate for the 2nd moment estimations. Default: 0.999.
Should be in range (0.0, 1.0).
eps (float): Term added to the denominator to improve numerical stability. Default: 1e-6.
Should be greater than 0.
......
......@@ -201,8 +201,8 @@ class DataWrapper(Cell):
Args:
network (Cell): The training network for dataset.
dataset_types (list): The type of dataset. The list contains describes the types of the inputs.
dataset_shapes (list): The shapes of dataset. The list contains multiple sublists that describes
dataset_types (list): The type of dataset. The list contains the types of the inputs.
dataset_shapes (list): The shapes of dataset. The list contains multiple sublists that describe
the shape of the inputs.
queue_name (str): The identification of the dataset channel that supplies data to the network.
......
......@@ -663,16 +663,16 @@ class MaxPoolGradGrad(_PoolGrad):
strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
- **origin_input** (Tensor) - Tensor with data format "NCHW", data type should be float16.
......@@ -736,16 +736,16 @@ class MaxPoolGradGradWithArgmax(_PoolGrad):
strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
- **x** (Tensor) - Tensor with data format "NCHW", data type should be float16.
......
......@@ -756,11 +756,11 @@ class Conv2D(PrimitiveWithInfer):
out_j = \sum_{i=0}^{C_{in} - 1} ccor(W_{ij}, X_i) + b_j,
where :math:`ccor` is cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to :math:`i`-th channel of the :math:`j`-th
where :math:`ccor` is the cross correlation operator, :math:`C_{in}` is the input channel number, :math:`j` ranges
from :math:`0` to :math:`C_{out} - 1`, :math:`W_{ij}` corresponds to the :math:`i`-th channel of the :math:`j`-th
filter and :math:`out_{j}` corresponds to the :math:`j`-th channel of the output. :math:`W_{ij}` is a slice
of kernel and it has shape :math:`(\text{ks_h}, \text{ks_w})`, where :math:`\text{ks_h}` and
:math:`\text{ks_w}` are height and width of the convolution kernel. The full kernel has shape
:math:`\text{ks_w}` are the height and width of the convolution kernel. The full kernel has shape
:math:`(C_{out}, C_{in} // \text{group}, \text{ks_h}, \text{ks_w})`, where group is the group number
to split the input in the channel dimension.
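A hedged sketch of the primitive (assuming `P` is `mindspore.ops.operations`; shapes illustrative):
>>> input = Tensor(np.ones([10, 32, 32, 32]), mindspore.float32)
>>> weight = Tensor(np.ones([32, 32, 3, 3]), mindspore.float32)
>>> conv2d = P.Conv2D(out_channel=32, kernel_size=3)
>>> output = conv2d(input, weight)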
......@@ -1029,7 +1029,7 @@ class _Pool(PrimitiveWithInfer):
of two `int` for height and width. Default: 1.
strides (Union[int, tuple[int]]): The stride of the window, that should be
a tuple of two `int` for height and width. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
"""
......@@ -1104,16 +1104,16 @@ class MaxPool(_Pool):
strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
......@@ -1151,16 +1151,16 @@ class MaxPoolWithArgmax(_Pool):
strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
......@@ -1233,16 +1233,16 @@ class AvgPool(_Pool):
strides (Union[int, tuple[int]]): The distance of kernel moving, an int number that represents
the height and width of movement are both strides, or a tuple of two int numbers that
represent height and width of movement respectively. Default: 1.
padding (str): The optional values for pad mode, is "same" or "valid", not case sensitive.
padding (str): The optional value for pad mode is "same" or "valid". It is not case sensitive.
Default: "valid".
- same: Adopts the way of completion. Output height and width will be the same as
the input. Total number of padding will be calculated for horizontal and vertical
direction and evenly distributed to top and bottom, left and right if possible.
- same: Adopts the way of completion. The height and width of the output will be the same as
the input. The total amount of padding will be calculated in horizontal and vertical
directions and evenly distributed to top and bottom, left and right if possible.
Otherwise, the last extra padding will be done from the bottom and the right side.
- valid: Adopts the way of discarding. The possibly largest height and width of output
will be return without padding. Extra pixels will be discarded.
- valid: Adopts the way of discarding. The largest possible height and width of the output
will be returned without padding. Extra pixels will be discarded.
Inputs:
- **input** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.
......