[Docs] Modified the docs of some api for supporting list/tuple args. (#32360)

* fixed docs. * Fixed docs. test=document_fix code bak. fixed docs. test=document_fix * Revert to previous version of python/paddle/fluid/backward.py * fixed bugs. * test=document_fix. Fixed examples.

[Docs] Modified the docs of some api for supporting list/tuple args. (#32360)
* fixed docs. * Fixed docs. test=document_fix code bak. fixed docs. test=document_fix * Revert to previous version of python/paddle/fluid/backward.py * fixed bugs. * test=document_fix. Fixed examples.
15158927 · xiemoyuan · GitHub · 23d3e36a · 15158927 · 15158927
34 changed file
--- a/python/paddle/amp/auto_cast.py
+++ b/python/paddle/amp/auto_cast.py
@@ -28,10 +28,10 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
    Args:
        enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
-        custom_white_list(set|list, optional): The custom white_list. It's the set of ops that support
+        custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
             fp16 calculation and are considered numerically-safe and performance-critical. These ops 
             will be converted to fp16.
-        custom_black_list(set|list, optional): The custom black_list. The set of ops that support fp16
+        custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
             calculation and are considered numerically-dangerous and whose effects may also be 
             observed in downstream ops. These ops will not be converted to fp16.

--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -662,7 +662,7 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True):
    Args:
        tensor (Tensor): The output Tensor. Its data type
            should be float16, float32, float64, int32 or int64.
-        tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type
+        tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type
            should be float16, float32, float64, int32 or int64. Default value is None.
        src (int): The source rank id. Default value is 0.
        group (Group): The group instance return by new_group or None for global default group.
@@ -679,6 +679,8 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True):
            import paddle
            from paddle.distributed import init_parallel_env
+            # required: gpu
            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
            init_parallel_env()
            if paddle.distributed.ParallelEnv().local_rank == 0:

--- a/python/paddle/distributed/fleet/base/private_helper_function.py
+++ b/python/paddle/distributed/fleet/base/private_helper_function.py
@@ -24,7 +24,7 @@ def wait_server_ready(endpoints):
    port readiness.
    Args:
-    endpoints (list): endpoints string list, like:
+    endpoints (list|tuple): endpoints string list, like:
    ["127.0.0.1:8080", "127.0.0.1:8081"]
    Examples:

--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -325,7 +325,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
        func (function): The target function is called by spawned process.
            This function need to be able to pickled, so it must be defined
            at the top level of a module.
-        args (tuple, optional): Arguments passed to ``func``.
+        args (list|tuple, optional): Arguments passed to ``func``.
        nprocs (int, optional): Number of processed to start. Default: -1.
            when nprocs is -1, the available device will be obtained from 
            the environment variable when the model is executed: If use GPU, 

--- a/python/paddle/fluid/dataloader/dataset.py
+++ b/python/paddle/fluid/dataloader/dataset.py
@@ -233,7 +233,7 @@ class TensorDataset(Dataset):
    each sample by indexing tensors in the 1st dimension.
    Args:
-        tensors(list of Tensor): tensors with same shape in the 1st dimension.
+        tensors(list|tuple): A list/tuple of tensors with same shape in the 1st dimension.
    Returns:
        Dataset: a Dataset instance wrapping tensors.

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -418,7 +418,7 @@ def cuda_places(device_ids=None):
    [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].
    Parameters:
-        device_ids (list or tuple of int, optional): list of GPU device ids.
+        device_ids (list|tuple, optional): A list/tuple of int of GPU device ids.
    Returns:
        list of paddle.CUDAPlace: Created GPU place list.
@@ -429,6 +429,8 @@ def cuda_places(device_ids=None):
            import paddle
            import paddle.static as static
+            # required: gpu
            paddle.enable_static()
            cuda_places = static.cuda_places()

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1913,7 +1913,7 @@ def load(program, model_path, executor=None, var_list=None):
        model_path(str): The file prefix store the program
        executor(Executor, optional): The executor used for initialize the parameter
                                      When startup program is not run.
-        var_list(list, optional): The Tensor list to load single model file saved with
+        var_list(list|tuple, optional): The Tensor list/tuple to load single model file saved with
                                  [ save_params, save_persistables, save_vars ].
                                  Default: None
@@ -2103,7 +2103,7 @@ def load_program_state(model_path, var_list=None):
    Args:
        model_path(str): The file prefix store the program
-        var_list(list, optional): The Tensor list to load saved with
+        var_list(list|tuple, optional): The Tensor list/tuple to load saved with
                                  [ save_params, save_persistables, save_vars ].
                                  Default: None.
                                  The var_list is only used to get name,

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -148,7 +148,7 @@ def create_global_var(shape,
    This function creates a new tensor variable with value in the global block(block 0).
    Parameters:
-        shape (list of int): Shape of the variable
+        shape (list[int]|tuple[int]): Shape of the variable
        value (float): The value of the variable. The new created
                      variable will be filled with it.
        dtype (str): Data type of the variable

--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
@@ -81,7 +81,7 @@ def set_cuda_rng_state(state_list):
    Sets generator state for all cuda generators
    Args:
-        state_list(list): The cuda states to set back to cuda generators. state_list is obtained from get_cuda_rng_state().
+        state_list(list|tuple): The cuda states to set back to cuda generators. state_list is obtained from get_cuda_rng_state().
    Returns:
        None

--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -182,7 +182,7 @@ class Accuracy(Metric):
    Encapsulates accuracy metric logic.
    Args:
-        topk (int|tuple(int)): Number of top elements to look at
+        topk (int|list[int]|tuple[int]): Number of top elements to look at
            for computing accuracy. Default is (1,).
        name (str, optional): String name of the metric instance. Default
            is `acc`.

--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -207,7 +207,7 @@ def interpolate(x,
        size (list|tuple|Tensor|None): Output shape of image resize
             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
-             Default: None. If a list, each element can be an integer or a Tensor of shape: [1].
+             Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
             If a Tensor, its dimensions size should be a 1.
        scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At
             least one of :attr:`size` or :attr:`scale_factor` must be set.
@@ -638,7 +638,7 @@ def upsample(x,
        size (list|tuple|Tensor|None): Output shape of image resize
             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
-             Default: None. If a list, each element can be an integer or a Tensor of shape: [1].
+             Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
             If a Tensor , its dimensions size should be a 1.
        scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At
             least one of :attr:`size` or :attr:`scale_factor` must be set.

--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -218,7 +218,7 @@ def conv1d(x,
        weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is
            the number of output channels, g is the number of groups, K is the kernel's size. 
        bias (Tensor, optional): The bias with shape [M,]. Default: None.
-        stride (int or tuple, optional): The stride size. If stride is a tuple, it must
+        stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain one integers, (stride_size). Default: 1.
        padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
            1. a string in ['valid', 'same'].
@@ -227,7 +227,7 @@ def conv1d(x,
            4. a list[int] or tuple[int] whose length is 2. It has the form  [pad_before, pad_after].
            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
            The default value is 0.
-        dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain one integer, (dilation_size). Default: 1.
        groups (int, optional): The groups number of the conv1d function. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -250,7 +250,7 @@ def conv1d(x,
        ValueError: If the channel dimension of the input is less than or equal to zero.
        ValueError: If `data_format` is not "NCL" or "NLC".
        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
            or the element corresponding to the input's channel is not 0.
        ShapeError: If the input is not 3-D Tensor.
        ShapeError: If the input's dimension size and filter's dimension size not equal.
@@ -451,8 +451,8 @@ def conv2d(x,
            the number of output channels, g is the number of groups, kH is the filter's
            height, kW is the filter's width. 
        bias (Tensor, optional): The bias with shape [M,].
-        stride (int|tuple): The stride size. It means the stride in convolution. 
+        stride (int|list|tuple): The stride size. It means the stride in convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). 
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
        padding (string|int|list|tuple): The padding size. It means the number of zero-paddings
            on both sides for each dimension.If `padding` is a string, either 'VALID' or
@@ -464,8 +464,8 @@ def conv2d(x,
            when `data_format` is `"NHWC"`, `padding` can be in the form
            `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            Default: padding = 0.
-        dilation (int|tuple): The dilation size. It means the spacing between the kernel
+        dilation (int|list|tuple): The dilation size. It means the spacing between the kernel
-            points. If dilation is a tuple, it must contain two integers, (dilation_height, 
+            points. If dilation is a list/tuple, it must contain two integers, (dilation_height, 
            dilation_width). Otherwise, dilation_height = dilation_width = dilation. 
            Default: dilation = 1.
        groups (int): The groups number of the Conv2D Layer. According to grouped
@@ -488,7 +488,7 @@ def conv2d(x,
        ValueError: If `data_format` is not "NCHW" or "NHWC".
        ValueError: If the channel dimension of the input is less than or equal to zero.
        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
            or the element corresponding to the input's channel is not 0.
        ShapeError: If the input is not 4-D Tensor.
        ShapeError: If the input's dimension size and filter's dimension size not equal.
@@ -637,7 +637,7 @@ def conv1d_transpose(x,
            K is the size of the kernel.
        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
-            If stride is a tuple, it must contain one integer, `(stride_size)`.
+            If stride is a list/tuple, it must contain one integer, `(stride_size)`.
            Default: stride = 1.
        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
@@ -645,7 +645,7 @@ def conv1d_transpose(x,
             If `padding` is a tuple or list, it could be in two forms:
             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
-             If it is a tuple, it must contain one integer. Default: 0.
+             If it is a list/tuple, it must contain one integer. Default: 0.
        groups(int, optional): The groups number of the conv1d transpose function. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
@@ -653,10 +653,10 @@ def conv1d_transpose(x,
            filters is only connected to the second half of the input channels.
            Default: groups = 1.
        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
-            If dilation is a tuple, it must contain one integer, `(dilation_size)`.
+            If dilation is a list/tuple, it must contain one integer, `(dilation_size)`.
            Default: dilation = 1.
        output_size(int|tuple|list, optional): The output image size. If output size is a
-            tuple, it must contain one integer, `(feature_length)`. None if use
+            tuple/list, it must contain one integer, `(feature_length)`. None if use
            filter_size(shape of weight), padding, and stride to calculate output_size.
        data_format (str, optional): Specify the data format of the input, and the data format of the output 
            will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
@@ -675,7 +675,7 @@ def conv1d_transpose(x,
    Raises:
        ValueError: If `data_format` is a string, but not "NCL" or "NLC".
        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
            or the element corresponding to the input's channel is not 0.
        ValueError: If `output_size` and filter_size are None at the same time.
        ValueError: If `output_padding` is greater than `stride`.
@@ -900,7 +900,7 @@ def conv2d_transpose(x,
            kH is the height of the kernel, and kW is the width of the kernel.
        bias(Tensor, optional): The bias, a Tensor with shape [M, ].
        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain two integers, (stride_height, stride_width). 
+            If stride is a list/tuple, it must contain two integers, (stride_height, stride_width). 
            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
        padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings 
            on both sides for each dimension. If `padding` is a string, either 'VALID' or 
@@ -921,10 +921,10 @@ def conv2d_transpose(x,
            filters is only connected to the second half of the input channels.
            Default: groups = 1.
        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
-            If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). 
+            If dilation is a list/tuple, it must contain two integers, (dilation_height, dilation_width). 
            Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1.
        output_size(int|tuple|list, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_height, image_width). None if use
+            tuple/list, it must contain two integers, (image_height, image_width). None if use
            filter_size(shape of weight), padding, and stride to calculate output_size.
        data_format (str, optional): Specify the data format of the input, and the data format of the output 
            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
@@ -943,7 +943,7 @@ def conv2d_transpose(x,
    Raises:
        ValueError: If `data_format` is not "NCHW" or "NHWC".
        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
            or the element corresponding to the input's channel is not 0.
        ValueError: If `output_size` and kernel_size are None at the same time.
        ShapeError: If the input is not 4-D Tensor.
@@ -1120,8 +1120,8 @@ def conv3d(x,
            where M is the number of filters(output channels), g is the number of groups,
            kD, kH, kW are the filter's depth, height and width respectively.
        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
-        stride (int|tuple): The stride size. It means the stride in convolution. If stride is a 
+        stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a 
-            tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
+            list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). 
            Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1.
        padding (string|int|list|tuple): The padding size. It means the number of zero-paddings 
            on both sides for each dimension. If `padding` is a string, either 'VALID' or
@@ -1133,8 +1133,8 @@ def conv3d(x,
            when `data_format` is `"NDHWC"`, `padding` can be in the form
            `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
            Default: padding = 0.
-        dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
+        dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. 
-            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height,
+            If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height,
            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
            Default: dilation = 1.
        groups (int): The groups number of the Conv3D Layer. According to grouped
@@ -1292,7 +1292,7 @@ def conv3d_transpose(x,
            kD, kH, kW are the filter's depth, height and width respectively.
        bias (Tensor, optional): The bias, a Tensor of shape [M, ].
        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, 
            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
            Default: stride = 1.
        padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings 
@@ -1314,11 +1314,11 @@ def conv3d_transpose(x,
            filters is only connected to the second half of the input channels.
            Default: groups=1
        dilation(int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. 
-            If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, 
+            If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, 
            dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. 
            Default: dilation = 1.
        output_size(int|list|tuple, optional): The output image size. If output size is a
-            tuple, it must contain three integers, (image_depth, image_height, image_width).
+            list/tuple, it must contain three integers, (image_depth, image_height, image_width).
            None if use filter_size(shape of weight), padding, and stride to calculate output_size.
        data_format (str, optional): Specify the data format of the input, and the data format of the output 
            will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`.
@@ -1338,7 +1338,7 @@ def conv3d_transpose(x,
    Raises:
        ValueError: If `data_format` is not "NCDHW" or "NDHWC".
        ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
            or the element corresponding to the input's channel is not 0.
        ValueError: If `output_size` and kernel_size are None at the same time.
        ShapeError: If the input is not 5-D Tensor.

--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -300,7 +300,7 @@ class Upsample(layers.Layer):
        size (list|tuple|Tensor|None): Output shape of image resize
             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
-             Default: None. If a list, each element can be an integer or a Tensor of shape: [1].
+             Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
             If a Tensor , its dimensions size should be a 1.
        scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At
             least one of :attr:`size` or :attr:`scale_factor` must be set.
@@ -419,7 +419,7 @@ class UpsamplingNearest2D(layers.Layer):
                          its data format is specified by :attr:`data_format`.
        size (list|tuple|Tensor|None): Output shape of image resize
             layer, the shape is (out_h, out_w) when input is a 4-D Tensor.
-             Default: None. If a list, each element can be an integer or a Tensor of shape: [1].
+             Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
             If a Tensor , its dimensions size should be a 1.
        scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At
             least one of :attr:`size` or :attr:`scale_factor` must be set.
@@ -506,7 +506,7 @@ class UpsamplingBilinear2D(layers.Layer):
                          its data format is specified by :attr:`data_format`.
        size (list|tuple|Tensor|None): Output shape of image resize
             layer, the shape is (out_h, out_w) when input is a 4-D Tensor.
-             Default: None. If a list, each element can be an integer or a Tensor  of shape: [1].
+             Default: None. If a list/tuple, each element can be an integer or a Tensor  of shape: [1].
             If a Tensor , its dimensions size should be a 1.
        scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At
             least one of :attr:`size` or :attr:`scale_factor` must be set.

--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -232,16 +232,16 @@ class Conv1D(_ConvNd):
        in_channels(int): The number of channels in the input image.
        out_channels(int): The number of filter. It is as same as the output
            feature map.
-        kernel_size (int|tuple|list): The filter size. If kernel_size is a tuple,
+        kernel_size (int|tuple|list): The filter size. If kernel_size is a tuple/list,
            it must contain one integer, (kernel_size).
-        stride (int|tuple|list, optional): The stride size. If stride is a tuple, it must
+        stride (int|tuple|list, optional): The stride size. If stride is a tuple/list, it must
            contain one integer, (stride_size). Default: 1.
        padding(int|str|tuple|list, optional): The size of zeros to be padded. It must be in one of the following forms.
            1. a string in ['valid', 'same'].
            2. an int, which means the feature map is zero paded by size of `padding` on both sides.
            3. a list[int] or tuple[int] whose length is 1, which means the feature map is zero paded by size of `padding[0]` on both sides.
            The default value is 0.
-        dilation (int|tuple|list, optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|tuple|list, optional): The dilation size. If dilation is a tuple/list, it must
            contain one integer, (dilation_size). Default: 1.
        groups (int, optional): The groups number of the conv2d Layer. According to grouped
            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
@@ -410,12 +410,12 @@ class Conv1DTranspose(_ConvNd):
        in_channels(int): The number of channels in the input image.
        out_channels(int): The number of the filter. It is as same as the output
            feature map.
-        kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple,
+        kernel_size(int|tuple|list, optional): The filter size. If kernel_size is a tuple/list,
            it must contain one integers, (kernel_size). None if
            use output size to calculate kernel_size. Default: None. kernel_size and
            output_size should not be None at the same time.
        stride(int|tuple|list, optional): The stride size. It means the stride in transposed convolution.
-            If stride is a tuple, it must contain one integer, (stride_size).
+            If stride is a tuple/list, it must contain one integer, (stride_size).
            Default: stride = 1.
        padding(int|list|str|tuple, optional): The padding size. The padding argument effectively adds
             `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a
@@ -423,7 +423,7 @@ class Conv1DTranspose(_ConvNd):
             If `padding` is a tuple or list, it could be in two forms:
             `[pad]` or `[pad_left, pad_right]`. Default: padding = 0.
        output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension.
-             If it is a tuple, it must contain one integer. Default: 0.
+             If it is a tuple/list, it must contain one integer. Default: 0.
        groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by
            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
            when group=2, the first half of the filters is only connected to the
@@ -432,7 +432,7 @@ class Conv1DTranspose(_ConvNd):
            Default: groups = 1.
        bias(bool, optional): Whether to use bias. Default: True.
        dilation(int|tuple|list, optional): The dilation size. It means the spacing between the kernel points.
-            If dilation is a tuple, it must contain one integer, (dilation_size).
+            If dilation is a tuple/list, it must contain one integer, (dilation_size).
            Default: dilation = 1.
        weight_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
            of conv1d_transpose. If it is set to None or one attribute of ParamAttr, conv1d_transpose
@@ -451,7 +451,7 @@ class Conv1DTranspose(_ConvNd):
    Shape:
        - x(Tensor): 3-D tensor with shape (batch, in_channels, length) when data_format is "NCL" or shape (batch, length, in_channels) when data_format is "NLC".
-        - output_size(int|tuple|list, optional): The output image size. If output size is a tuple, it must contain one integer, (feature_length). None if use kernel_size, padding, output_padding and stride to calculate output_size. If output_size and kernel_size are specified at the same time, They should follow the formula above. Default: None. output_size and kernel_size should not be None at the same time.
+        - output_size(int|tuple|list, optional): The output image size. If output size is a tuple/list, it must contain one integer, (feature_length). None if use kernel_size, padding, output_padding and stride to calculate output_size. If output_size and kernel_size are specified at the same time, They should follow the formula above. Default: None. output_size and kernel_size should not be None at the same time.
        - output(Tensor): 3-D tensor with same shape as input x.
    Examples:
@@ -555,7 +555,7 @@ class Conv2D(_ConvNd):
        in_channels(int): The number of input channels in the input image.
        out_channels(int): The number of output channels produced by the convolution.
        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain three integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. The default value is 1.
        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -565,7 +565,7 @@ class Conv2D(_ConvNd):
            4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
            The default value is 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        groups(int, optional): The groups number of the Conv3D Layer. According to grouped
@@ -710,10 +710,10 @@ class Conv2DTranspose(_ConvNd):
    Parameters:
        in_channels(int): The number of channels in the input image.
        out_channels(int): The number of channels produced by the convolution.
-        kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple,
+        kernel_size(int|list|tuple): The kernel size. If kernel_size is a list/tuple,
            it must contain two integers, (kernel_size_H, kernel_size_W).
            Otherwise, the kernel will be a square.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: 1.
        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -725,7 +725,7 @@ class Conv2DTranspose(_ConvNd):
            The default value is 0.
        output_padding(int|list|tuple, optional): Additional size added to one side
            of each dimension in the output shape. Default: 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: 1.
        groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by
@@ -866,7 +866,7 @@ class Conv3D(_ConvNd):
        in_channels(int): The number of input channels in the input image.
        out_channels(int): The number of output channels produced by the convolution.
        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
            stride_D = stride_H = stride_W = stride. The default value is 1.
        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -876,7 +876,7 @@ class Conv3D(_ConvNd):
            4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
            The default value is 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        groups(int, optional): The groups number of the Conv3D Layer. According to grouped
@@ -1037,11 +1037,11 @@ class Conv3DTranspose(_ConvNd):
    Parameters:
        in_channels(int): The number of channels in the input image.
        out_channels(int): The number of channels produced by the convolution.
-        kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple,
+        kernel_size(int|list|tuple): The kernel size. If kernel_size is a list/tuple,
            it must contain three integers, (kernel_size_D, kernel_size_H, kernel_size_W).
            Otherwise, the kernel will be a square.
        stride(int|list|tuple, optional): The stride size. It means the stride in transposed convolution. 
-            If stride is a tuple, it must contain three integers, (stride_depth, stride_height, 
+            If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, 
            stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. 
            The default value is 1.
        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
@@ -1053,7 +1053,7 @@ class Conv3DTranspose(_ConvNd):
            The default value is 0.
        output_padding(int|list|tuple, optional): Additional size added to one side
            of each dimension in the output shape. Default: 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by
@@ -1071,11 +1071,6 @@ class Conv3DTranspose(_ConvNd):
            If it is set to None or one attribute of ParamAttr, conv3d_transpose
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. The default value is None.
-        output_size(int|list|tuple, optional): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
        data_format(str, optional): Data format that specifies the layout of input.
            It can be "NCDHW" or "NDHWC". Default: "NCDHW".

--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -447,7 +447,7 @@ class LSTMCell(RNNCellBase):
    Inputs:
        - **inputs** (Tensor): shape `[batch_size, input_size]`, the input, corresponding to :math:`x_t` in the formula.
-        - **states** (tuple, optional): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
+        - **states** (list|tuple, optional): a list/tuple of two tensors, each of shape `[batch_size, hidden_size]`, the previous hidden state, corresponding to :math:`h_{t-1}, c_{t-1}` in the formula. When states is None, zero state is used. Defaults to None.
    Returns:
        - **outputs** (Tensor): shape `[batch_size, hidden_size]`, the output, corresponding to :math:`h_{t}` in the formula.
@@ -1251,7 +1251,7 @@ class LSTM(RNNBase):
    Inputs:
        - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`.
-        - **initial_states** (tuple, optional): the initial state, a tuple of (h, c), the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
+        - **initial_states** (list|tuple, optional): the initial state, a list/tuple of (h, c), the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used.
        - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings.
    Returns:

--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -461,14 +461,14 @@ class TransformerEncoderLayer(Layer):
            normalization and post-precess includes dropout, residual connection.
            Otherwise, no pre-process and post-precess includes dropout, residual
            connection, layer normalization. Default False
-        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for
            MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN.
            Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
            Default: None, which means the default weight parameter property is used.
            See usage for details in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
            MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
            Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
            The `False` value means the corresponding layer would not have trainable
@@ -747,16 +747,16 @@ class TransformerDecoderLayer(Layer):
            normalization and post-precess includes dropout, residual connection.
            Otherwise, no pre-process and post-precess includes dropout, residual
            connection, layer normalization. Default False
-        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
+            If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for
            self attention, `weight_attr[1]` would be used as `weight_attr` for
            cross attention, and `weight_attr[2]` would be used as `weight_attr`
            for linear in FFN. Otherwise, the three sub-layers all uses it as
            `weight_attr` to create parameters. Default: None, which means the
            default weight parameter property is used. See usage for details
            in :ref:`api_paddle_fluid_param_attr_ParamAttr` . 
-        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
+            If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for
            self attention, `bias_attr[1]` would be used as `bias_attr` for
            cross attention, and `bias_attr[2]` would be used as `bias_attr`
            for linear in FFN. Otherwise, the three sub-layers all uses it as
@@ -1129,8 +1129,8 @@ class Transformer(Layer):
            normalization and post-precess includes dropout, residual connection.
            Otherwise, no pre-process and post-precess includes dropout, residual
            connection, layer normalization. Default False
-        weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
+        weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
+            If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` 
            would be used as `weight_attr` for cross attention of `TransformerDecoder`, 
            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. 
@@ -1142,8 +1142,8 @@ class Transformer(Layer):
            Default: None, which means the default weight parameter property is used. 
            See usage for details
            in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property.
-            If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
+            If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
            would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. 

--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -36,20 +36,20 @@ class Adadelta(Optimizer):
        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
    Args:
-	learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
        rho (float): a floating point value indicating the decay rate. Default 0.95.
-        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-        It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-        :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-        If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-        the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-        Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-        Default None, meaning there is no regularization. 
+            Default None, meaning there is no regularization. 
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,

--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -43,16 +43,16 @@ class Adagrad(Optimizer):
            It can be a float value or a ``Variable`` with a float type.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-06.
-	parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-	    This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. \
-	    The default value is None in static mode, at this time all parameters will be updated.
+            The default value is None in static mode, at this time all parameters will be updated.
-	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-	    It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-	    :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
+            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
-	    If a parameter has set regularizer using :ref:`api_paddle_fluid_param_attr_aramAttr` already, \
+            If a parameter has set regularizer using :ref:`api_paddle_fluid_param_attr_aramAttr` already, \
-	    the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-	    Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-	    Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies, 
            ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None, 

--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -60,16 +60,16 @@ class Adam(Optimizer):
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
-	parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-	    This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. \
-	    The default value is None in static mode, at this time all parameters will be updated.
+            The default value is None in static mode, at this time all parameters will be updated.
-	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-	    It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-	    the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-	    Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-	    Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,

--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -53,16 +53,16 @@ class Adamax(Optimizer):
            The default value is 0.999.
        epsilon (float, optional): A small float value for numerical stability.
            The default value is 1e-08.
-	parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-	    This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. \
-	    The default value is None in static mode, at this time all parameters will be updated.
+            The default value is None in static mode, at this time all parameters will be updated.
-	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-	    It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-	    the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-	    Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-	    Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of 
            some derived class of ``GradientClipBase`` . There are three cliping strategies 
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , 

--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -43,9 +43,9 @@ class AdamW(Adam):
    Args:
        learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
            It can be a float value or a LRScheduler. The default value is 0.001.
-	parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
-	    This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. \
-	    The default value is None in static mode, at this time all parameters will be updated.
+            The default value is None in static mode, at this time all parameters will be updated.
        beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
            It should be a float number or a Tensor with shape [1] and data type as float32.
            The default value is 0.9.

--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -312,8 +312,8 @@ class PiecewiseDecay(LRScheduler):
            learning_rate = 0.1
    Args:
-        boundaries(list): A list of steps numbers. The type of element in the list is python int. 
+        boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int. 
-        values(list): A list of learning rate values that will be picked during different epoch boundaries. 
+        values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries. 
            The type of element in the list is python float.
        last_epoch (int, optional):  The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
        verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .

--- a/python/paddle/optimizer/momentum.py
+++ b/python/paddle/optimizer/momentum.py
@@ -49,16 +49,16 @@ class Momentum(Optimizer):
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        momentum (float): Momentum factor. The default value is 0.9.
-        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-        It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-        :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-        If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-        the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-        Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-        Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,

--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -53,7 +53,7 @@ class Optimizer(object):
    Args:
        learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
            It can be a float value or any subclass of ``LRScheduler`` .
-        parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \

--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -78,16 +78,16 @@ class RMSProp(Optimizer):
            the gradient; if False, by the uncentered second moment. Setting this to
            True may help with training, but is slightly more expensive in terms of
            computation and memory. Defaults to False.
-	parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-	    This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. \
-	    The default value is None in static mode, at this time all parameters will be updated.
+            The default value is None in static mode, at this time all parameters will be updated.
-	weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-	    It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-	    :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-	    If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-	    the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-	    Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-	    Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,

--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -30,16 +30,16 @@ class SGD(Optimizer):
    Parameters:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
-        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, at this time all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-        It canbe a float value as coeff of L2 regularization or \
+            It canbe a float value as coeff of L2 regularization or \
-        :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-        If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-        the regularization setting here in optimizer will be ignored for this parameter. \
+            the regularization setting here in optimizer will be ignored for this parameter. \
-        Otherwise, the regularization setting here in optimizer will take effect. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
-        Default None, meaning there is no regularization.
+            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,

--- a/python/paddle/static/nn/common.py
+++ b/python/paddle/static/nn/common.py
@@ -86,7 +86,7 @@ def fc(x,
        out.shape = (1, 2)
    Args:
-        x (Tensor|list of Tensor): A tensor or a list of tensor. The number of dimensions
+        x (Tensor|list[Tensor]|tuple[Tensor]): A tensor or a list/tuple of tensors. The number of dimensions
            of each tensor is at least 2. The data type should be float16, float32 or float64.
        size (int): The number of output units in this layer, which also means the feature
            size of output tensor.
@@ -233,16 +233,16 @@ def deform_conv2d(x,
            deformable convolution v1.
        num_filters(int): The number of filter. It is as same as the output
            image channel.
-        filter_size (int|tuple): The filter size. If filter_size is a tuple,
+        filter_size (int|list|tuple): The filter size. If filter_size is a list/tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square.
-        stride (int|tuple, Optional): The stride size. If stride is a tuple, it must
+        stride (int|list|tuple, Optional): The stride size. If stride is a list/tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple, Optional): The padding size. If padding is a tuple, it must
+        padding (int|list|tuple, Optional): The padding size. If padding is a list/tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple, Optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|list|tuple, Optional): The dilation size. If dilation is a list/tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        groups (int, Optional): The groups number of the deformable conv layer. According to

--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -132,7 +132,7 @@ def flip(x, axis, name=None):
    Args:
        x (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor x
            should be float32, float64, int32, int64, bool.
-        axis (list): The axis(axes) to flip on. Negative indices for indexing from the end are accepted.
+        axis (list|tuple): The axis(axes) to flip on. Negative indices for indexing from the end are accepted.
        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name` .
@@ -545,7 +545,7 @@ def squeeze(x, axis=None, name=None):
    Args:
        x (Tensor): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
-        axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None.
+        axis (int|list|tuple, optional): An integer or list/tuple of integers, indicating the dimensions to be squeezed. Default is None.
                          The range of axis is :math:`[-ndim(x), ndim(x))`.
                          If axis is negative, :math:`axis = axis + ndim(x)`.
                          If axis is None, all the dimensions of x of size 1 will be removed.

--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -752,7 +752,7 @@ def add_n(inputs, name=None):
                              [14, 16, 18]]
    Args:
-        inputs (Tensor|list(Tensor)):  A Tensor list. The shape and data type of the list elements should be consistent.
+        inputs (Tensor|list[Tensor]|tuple[Tensor]):  A Tensor or a list/tuple of Tensors. The shape and data type of the list/tuple elements should be consistent.
            Input can be multi-dimensional Tensor, and data types can be: float32, float64, int32, int64.
        name(str, optional): The default value is None. Normally there is no need for
            user to set this property. For more information, please refer to :ref:`api_guide_Name`
@@ -1082,7 +1082,7 @@ def max(x, axis=None, keepdim=False, name=None):
    Args:
        x(Tensor): A tensor, the data type is float32,
            float64, int32, int64.
-        axis(list|int, optional): The axis along which the maximum is computed.
+        axis(int|list|tuple, optional): The axis along which the maximum is computed.
            If :attr:`None`, compute the maximum over all elements of
            `x` and return a Tensor with a single element,
            otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`.
@@ -1174,7 +1174,7 @@ def min(x, axis=None, keepdim=False, name=None):
    Args:
        x(Tensor): A tensor, the data type is float32, float64, int32, int64.
-        axis(list|int, optional): The axis along which the minimum is computed.
+        axis(int|list|tuple, optional): The axis along which the minimum is computed.
            If :attr:`None`, compute the minimum over all elements of
            `x` and return a Tensor with a single element,
            otherwise must be in the range :math:`[-x.ndim, x.ndim)`.

--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -454,13 +454,13 @@ def deform_conv2d(x,
            the number of output channels, g is the number of groups, kH is the filter's
            height, kW is the filter's width.
        bias (Tensor, optional): The bias with shape [M,].
-        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must
+        padding (int|list|tuple, optional): The padding size. If padding is a list/tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain two integers, (dilation_H, dilation_W). Otherwise, the
            dilation_H = dilation_W = dilation. Default: dilation = 1.
        deformable_groups (int): The number of deformable group partitions.
@@ -644,13 +644,13 @@ class DeformConv2D(Layer):
        in_channels(int): The number of input channels in the input image.
        out_channels(int): The number of output channels produced by the convolution.
        kernel_size(int|list|tuple): The size of the convolving kernel.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+        stride(int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
            contain three integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. The default value is 1.
-        padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must
+        padding (int|list|tuple, optional): The padding size. If padding is a list/tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding. Default: padding = 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a list/tuple, it must
            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
        deformable_groups (int): The number of deformable group partitions.

--- a/python/paddle/vision/transforms/functional.py
+++ b/python/paddle/vision/transforms/functional.py
@@ -153,8 +153,8 @@ def pad(img, padding, fill=0, padding_mode='constant'):
    Args:
        img (PIL.Image|np.array): Image to be padded.
        padding (int|list|tuple): Padding on each border. If a single int is provided this
-            is used to pad all borders. If tuple of length 2 is provided this is the padding
+            is used to pad all borders. If list/tuple of length 2 is provided this is the padding
-            on left/right and top/bottom respectively. If a tuple of length 4 is provided
+            on left/right and top/bottom respectively. If a list/tuple of length 4 is provided
            this is the padding for the left, top, right and bottom borders
            respectively.
        fill (float, optional): Pixel fill value for constant fill. If a tuple of

--- a/python/paddle/vision/transforms/functional_cv2.py
+++ b/python/paddle/vision/transforms/functional_cv2.py
@@ -136,8 +136,8 @@ def pad(img, padding, fill=0, padding_mode='constant'):
    Args:
        img (np.array): Image to be padded.
        padding (int|list|tuple): Padding on each border. If a single int is provided this
-            is used to pad all borders. If tuple of length 2 is provided this is the padding
+            is used to pad all borders. If list/tuple of length 2 is provided this is the padding
-            on left/right and top/bottom respectively. If a tuple of length 4 is provided
+            on left/right and top/bottom respectively. If a list/tuple of length 4 is provided
            this is the padding for the left, top, right and bottom borders
            respectively.
        fill (float, optional): Pixel fill value for constant fill. If a tuple of

--- a/python/paddle/vision/transforms/functional_pil.py
+++ b/python/paddle/vision/transforms/functional_pil.py
@@ -141,8 +141,8 @@ def pad(img, padding, fill=0, padding_mode='constant'):
    Args:
        img (PIL.Image): Image to be padded.
        padding (int|list|tuple): Padding on each border. If a single int is provided this
-            is used to pad all borders. If tuple of length 2 is provided this is the padding
+            is used to pad all borders. If list/tuple of length 2 is provided this is the padding
-            on left/right and top/bottom respectively. If a tuple of length 4 is provided
+            on left/right and top/bottom respectively. If a list/tuple of length 4 is provided
            this is the padding for the left, top, right and bottom borders
            respectively.
        fill (float, optional): Pixel fill value for constant fill. If a tuple of

--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -86,7 +86,7 @@ class Compose(object):
    together for a dataset transform.
    Args:
-        transforms (list): List of transforms to compose.
+        transforms (list|tuple): List/Tuple of transforms to compose.
    Returns:
        A compose object which is callable, __call__ for this Compose
@@ -608,8 +608,8 @@ class Normalize(BaseTransform):
    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
    Args:
-        mean (int|float|list): Sequence of means for each channel.
+        mean (int|float|list|tuple): Sequence of means for each channel.
-        std (int|float|list): Sequence of standard deviations for each channel.
+        std (int|float|list|tuple): Sequence of standard deviations for each channel.
        data_format (str, optional): Data format of img, should be 'HWC' or 
            'CHW'. Default: 'CHW'.
        to_rgb (bool, optional): Whether to convert to rgb. Default: False.
@@ -1022,11 +1022,11 @@ class Pad(BaseTransform):
    Args:
        padding (int|list|tuple): Padding on each border. If a single int is provided this
-            is used to pad all borders. If tuple of length 2 is provided this is the padding
+            is used to pad all borders. If list/tuple of length 2 is provided this is the padding
-            on left/right and top/bottom respectively. If a tuple of length 4 is provided
+            on left/right and top/bottom respectively. If a list/tuple of length 4 is provided
            this is the padding for the left, top, right and bottom borders
            respectively.
-        fill (int|list|tuple): Pixel fill value for constant fill. Default is 0. If a tuple of
+        fill (int|list|tuple): Pixel fill value for constant fill. Default is 0. If a list/tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant
        padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant.