diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index da527b26bf0608da5a648d92b492ff27cf2802f0..35fce9e9d6ba9d7a2f264bdd5c1f3deb7a2a67e9 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -44,7 +44,7 @@ class Conv1dTestCase(unittest.TestCase):
         self.spartial_shape = spartial_shape
         self.filter_size = filter_size
         self.data_format = data_format
-        self.channel_last = (self.data_format == "NHWC")
+        self.channel_last = (self.data_format == "NLC")
 
         self.padding = padding
         self.padding_mode = padding_mode
@@ -147,6 +147,14 @@ class Conv1dErrorTestCase(Conv1dTestCase):
                 self.paddle_nn_layer()
 
 
+class Conv1dTypeErrorTestCase(Conv1dTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(TypeError):
+                self.paddle_nn_layer()
+
+
 def add_cases(suite):
     suite.addTest(Conv1dTestCase(methodName='runTest'))
     suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2))
@@ -161,6 +169,7 @@ def add_cases(suite):
         Conv1dTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
     suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1]))
+    suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1, 2]))
     suite.addTest(Conv1dTestCase(methodName='runTest', padding=2))
     suite.addTest(Conv1dTestCase(methodName='runTest'))
     suite.addTest(
@@ -178,7 +187,7 @@ def add_cases(suite):
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1dTypeErrorTestCase(
             methodName='runTest', padding_mode="reflect", padding="valid"))
     suite.addTest(
         Conv1dErrorTestCase(
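
The suite now distinguishes configurations that fail type checking from those that fail value validation: non-`zeros` padding modes only accept integer paddings, so combining `padding_mode="reflect"` with the string `"valid"` is expected to raise `TypeError` rather than `ValueError`. A hedged sketch of the scenario the new `Conv1dTypeErrorTestCase` exercises (illustration only, not part of the patch):

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.randn(2, 4, 8).astype("float32"))  # NCL layout
try:
    # Non-"zeros" padding modes require an integer padding, not a string.
    conv = paddle.nn.Conv1d(4, 6, 3, padding="valid", padding_mode="reflect")
    conv(x)
except TypeError as e:
    print("rejected as expected:", e)
```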
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
index 73227dd3610376d85fcfc70bb2653dfd927427fd..4c98aacd209dab8e5dc9e7744922a927700c4bb3 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
@@ -201,6 +201,7 @@ def add_cases(suite):
         ConvTranspose1dTestCase(
             methodName='runTest', data_format="NLC", stride=3,
             output_padding=2))
+    suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[1, 2]))
 
 
 def add_error_cases(suite):
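
Both new `padding=[1, 2]` cases cover asymmetric padding, where a two-element list is read as `[pad_before, pad_after]` along the length dimension rather than the symmetric single-int form. A quick sketch of the expected shape arithmetic for the forward case (assuming stride 1 and dilation 1):

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.randn(2, 4, 8).astype("float32"))  # NCL
conv = paddle.nn.Conv1d(4, 6, 3, padding=[1, 2])
# Expected length: 8 + 1 + 2 - (3 - 1) = 9, i.e. shape [2, 6, 9].
print(conv(x).shape)
```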
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 42d7d98aefcbbf51f562b98c4c494aeccfe20cf2..3c1482e69c3c36232ee5d70f2156a8d16c2d212a 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -232,7 +232,7 @@ def conv1d(x,
         raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. "
                          "Received Attr(data_format): {}.".format(data_format))
 
-    channel_last = (data_format == "NHWC")
+    channel_last = (data_format == "NLC")
     channel_dim = -1 if channel_last else 1
     conv2d_data_format = "NHWC" if channel_last else "NCHW"
     num_channels = x.shape[channel_dim]
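
This is the substantive fix in `conv1d`: the 1-D channel-last layout is `"NLC"`, not the 2-D `"NHWC"`, so the old comparison left `channel_last` permanently `False` and `"NLC"` inputs read their channel count from the wrong axis. A minimal sketch of the corrected dispatch (illustrative helper, not part of the patch):

```python
# How the corrected check drives the downstream layout decisions.
def layout_info_1d(data_format, x_shape):
    channel_last = (data_format == "NLC")  # old code compared to "NHWC": never True
    channel_dim = -1 if channel_last else 1
    conv2d_data_format = "NHWC" if channel_last else "NCHW"
    return channel_dim, conv2d_data_format, x_shape[channel_dim]

print(layout_info_1d("NLC", [2, 8, 4]))  # (-1, 'NHWC', 4)
print(layout_info_1d("NCL", [2, 4, 8]))  # (1, 'NCHW', 4)
```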
@@ -399,7 +399,7 @@ def conv2d(x,
             `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when 
             `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], 
             [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form
+            when `data_format` is `"NHWC"`, `padding` can be in the form
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel
@@ -733,20 +733,31 @@ def conv_transpose1d(x,
 
     stride = utils.convert_to_list(stride, 1, 'stride') + [1]
     dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1]
-    output_padding = utils.convert_to_list(output_padding, 1,
-                                           'output_padding') + [0]
-    if output_padding[0] > stride[0]:
-        raise ValueError(
-            "The size of output_padding should not be greater than stride."
-            "But got output_padding={} and stride={}".format(output_padding[0],
-                                                             stride[0]))
 
     if output_size is None:
         output_size = []
-    elif isinstance(output_size, (list, tuple, int)):
-        output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1]
     else:
-        raise ValueError("output_size should be int, or list, tuple of ints")
+        if output_padding != 0:
+            raise ValueError('output_padding option is mutually exclusive with '
+                             'output_size')
+        if isinstance(output_size, (list, tuple, int)):
+            output_size = utils.convert_to_list(output_size, 1,
+                                                'output_size') + [1]
+        else:
+            raise ValueError(
+                "output_size should be int, or list, tuple of ints")
+
+    if output_padding == 0:
+        output_padding = []
+    else:
+        output_padding = utils.convert_to_list(output_padding, 1,
+                                               'output_padding') + [0]
+
+    if len(output_padding) > 0 and output_padding[0] > stride[0]:
+        raise ValueError(
+            "The size of output_padding should not be greater than stride."
+            "But got output_padding={} and stride={}".format(output_padding[0],
+                                                             stride[0]))
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
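
The rewrite makes `output_padding` mutually exclusive with `output_size`, validates it against `stride` only when it is actually used, and passes it to the `conv2d_transpose` op as an attribute instead of applying a trailing `pad2d` (removed further down). The value feeds the standard transposed-convolution length formula; a worked check under that assumption:

```python
# L_out = (L_in - 1)*stride - 2*padding + dilation*(k - 1) + output_padding + 1
def conv_transpose1d_out_len(L_in, k, stride=1, padding=0, dilation=1,
                             output_padding=0):
    if output_padding > stride:  # mirrors the check in the patch
        raise ValueError("output_padding should not be greater than stride")
    return ((L_in - 1) * stride - 2 * padding + dilation * (k - 1)
            + output_padding + 1)

print(conv_transpose1d_out_len(8, 3, stride=3, output_padding=2))  # 26
```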
@@ -761,16 +772,17 @@ def conv_transpose1d(x,
     weight = nn.unsqueeze(input=weight, axes=[-1])
 
     if in_dygraph_mode():
-        attrs = ('output_size', output_size, 'strides', stride, 'paddings',
-                 padding, 'padding_algorithm', padding_algorithm, 'dilations',
-                 dilation, 'groups', groups, 'use_cudnn', use_cudnn,
-                 'data_format', conv2d_data_format)
+        attrs = ('output_padding', output_padding, 'output_size', output_size,
+                 'strides', stride, 'paddings', padding, 'padding_algorithm',
+                 padding_algorithm, 'dilations', dilation, 'groups', groups,
+                 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format)
         out = getattr(core.ops, op_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     else:
         inputs = {'Input': [x], 'Filter': [weight]}
         attrs = {
+            'output_padding': output_padding,
             'output_size': output_size,
             'strides': stride,
             'paddings': padding,
@@ -791,12 +803,6 @@ def conv_transpose1d(x,
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
 
-    if output_size is None:
-        out = pad2d(
-            out,
-            padding=[0, output_padding, 0, 0],
-            data_format=conv2d_data_format,
-            name=name)
     out = nn.squeeze(input=out, axes=[squeeze_axis])
     return out
 
@@ -888,9 +894,9 @@ def conv_transpose2d(x,
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
             it could be in three forms: `[pad_height, pad_width]` or 
             `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCHW"`, `pool_padding` can be in the form 
+            and when `data_format` is `"NCHW"`, `padding` can be in the form 
             `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NHWC"`, `pool_padding` can be in the form 
+            when `data_format` is `"NHWC"`, `padding` can be in the form 
             `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         output_padding(int|list|tuple, optional): Additional size added to one side
@@ -1116,9 +1122,9 @@ def conv3d(x,
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
             it could be in three forms: `[pad_depth, pad_height, pad_width]` or
             `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            and when `data_format` is `"NCDHW"`, `padding` can be in the form
             `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            when `data_format` is `"NDHWC"`, `padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
@@ -1340,9 +1346,9 @@ def conv_transpose3d(x,
             'SAME' which is the padding algorithm. If padding size is a tuple or list,
             it could be in three forms: `[pad_depth, pad_height, pad_width]` or
             `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`,
-            and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form
+            and when `data_format` is `"NCDHW"`, `padding` can be in the form
             `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`.
-            when `data_format` is `"NDHWC"`, `pool_padding` can be in the form
+            when `data_format` is `"NDHWC"`, `padding` can be in the form
             `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`.
             Default: padding = 0.
         output_padding(int|list|tuple, optional): Additional size added to one side
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 4e342c00528a2c0115940bb7f695e1ed5b582382..f3985781adb6267780cc974cef7dc3fa8ae46b38 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -113,7 +113,7 @@ class _ConvNd(layers.Layer):
             attr=self._bias_attr, shape=[self._out_channels], is_bias=True)
 
 
-class Conv1d(layers.Layer):
+class Conv1d(_ConvNd):
     """
     This interface is used to construct a callable object of the ``Conv1d`` class.
     For more details, refer to code examples.
@@ -172,8 +172,7 @@ class Conv1d(layers.Layer):
             When in 'replicate' mode, uses input boundaries to pad the input tensor.
             When in 'circular' mode, uses circular input to pad the input tensor.
             Default is 'zeros'.
-        bias(bool, optional): Whether to use bias. Default: True.
-        param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
+        weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter)
             of conv1d. If it is set to None or one attribute of ParamAttr, conv1d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            will create ParamAttr as weight_attr. If the Initializer of the weight_attr
             is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
@@ -218,196 +217,6 @@ class Conv1d(layers.Layer):
           #   [160. 211.]]]
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 padding_mode='zeros',
-                 bias=True,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCL",
-                 name=None):
-        super(Conv1d, self).__init__()
-        assert weight_attr is not False, "param_attr should not be False here."
-        self._in_channels = in_channels
-        self._out_channels = out_channels
-        self._groups = groups
-        if in_channels % groups != 0:
-            raise ValueError("in_channels must be divisible by groups.")
-        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
-        self._stride = utils.convert_to_list(stride, 1, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
-        self._padding = padding  # leave it to F.conv1d
-        self._weight_attr = weight_attr
-        self._bias_attr = bias_attr
-        self._data_format = data_format
-        self._name = name
-
-        self._padding_mode = padding_mode
-
-        valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
-        if padding_mode not in valid_padding_modes:
-            raise ValueError(
-                "padding_mode must be one of {}, but got padding_mode='{}'".
-                format(valid_padding_modes, padding_mode))
-
-        if padding_mode in {'reflect', 'replicate', 'circular'
-                            } and not isinstance(padding, np.int):
-            raise ValueError(
-                "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
-            )
-        if not isinstance(padding, str):
-            self._padding = utils.convert_to_list(padding, 1, 'padding') * 2
-
-        num_filter_channels = in_channels // groups
-        filter_shape = [self._out_channels, num_filter_channels
-                        ] + self._kernel_size
-
-        self.weight = self.create_parameter(
-            attr=self._weight_attr,
-            shape=filter_shape,
-            default_initializer=_get_default_param_initializer(
-                self._in_channels, filter_shape))
-        self.bias = self.create_parameter(
-            attr=self._bias_attr, shape=[self._out_channels],
-            is_bias=True) if bias else None
-
-    def forward(self, x):
-        padding = 0
-        if self._padding_mode != "zeros":
-            x = F.pad(x,
-                      self._padding,
-                      mode=self._padding_mode,
-                      data_format=self._data_format)
-        else:
-            padding = self._padding
-
-        out = F.conv1d(
-            x,
-            self.weight,
-            bias=self.bias,
-            padding=padding,
-            stride=self._stride,
-            dilation=self._dilation,
-            groups=self._groups,
-            data_format=self._data_format,
-            name=self._name)
-        return out
-
-
-class Conv2d(_ConvNd):
-    """
-    This interface is used to construct a callable object of the ``Conv2d`` class.
-    For more details, refer to code examples.
-    The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input and
-    Output are in NCHW format, where N is batch size, C is the number of
-    the feature map, H is the height of the feature map, and W is the width of the feature map.
-    Filter's shape is [MCHW] , where M is the number of output feature map,
-    C is the number of input feature map, H is the height of the filter,
-    and W is the width of the filter. If the groups is greater than 1,
-    C will equal the number of input feature map divided by the groups.
-    Please refer to UFLDL's `convolution
-    `_
-    for more details.
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
-    For each input :math:`X`, the equation is:
-
-    ..  math::
-
-        Out = \sigma (W \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a ``Tensor`` with NCHW format.
-    * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    
-    Parameters:
-        in_channels(int): The number of input channels in the input image.
-        out_channels(int): The number of output channels produced by the convolution.
-        kernel_size(int|list|tuple, optional): The size of the convolving kernel.
-        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
-            contain three integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. The default value is 1.
-        padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
-            1. a string in ['valid', 'same'].
-            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
-            3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
-            4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
-            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
-            The default value is 0.
-        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
-            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
-            dilation_D = dilation_H = dilation_W = dilation. The default value is 1.
-        groups(int, optional): The groups number of the Conv3d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. The default value is 1.
-        padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
-        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If it is set to None, the parameter
-            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
-            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
-        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. The default value is None.
-        data_format(str, optional): Data format that specifies the layout of input.
-            It can be "NCHW" or "NHWC". Default: "NCHW".
-
-    Attribute:
-
-        **weight** (Parameter): the learnable weights of filter of this layer.
-
-        **bias** (Parameter or None): the learnable bias of this layer.
-
-    Shape:
-
-        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        ..  math::
-
-           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
-
-           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
-
-    Examples:
-
-        .. code-block:: python
-
-          import numpy as np
-          import paddle
-          import paddle.nn as nn
-          x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
-          
-          paddle.disable_static()
-          x_var = paddle.to_tensor(x)
-          conv = nn.Conv2d(4, 6, (3, 3))
-          y_var = conv(x_var)
-          y_np = y_var.numpy()
-          print(y_np.shape)
-          
-          # (2, 6, 6, 6)
-    """
-
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -419,13 +228,13 @@ class Conv2d(_ConvNd):
                  padding_mode='zeros',
                  weight_attr=None,
                  bias_attr=None,
-                 data_format="NCHW"):
-        super(Conv2d, self).__init__(
+                 data_format="NCL"):
+        super(Conv1d, self).__init__(
             in_channels,
             out_channels,
             kernel_size,
             False,
-            2,
+            1,
             stride=stride,
             padding=padding,
             padding_mode=padding_mode,
@@ -436,25 +245,20 @@ class Conv2d(_ConvNd):
             data_format=data_format)
 
     def forward(self, x):
-        if self._padding_mode != 'zeros':
+        padding = 0
+        if self._padding_mode != "zeros":
             x = F.pad(x,
-                      self._reversed_padding_repeated_twice,
+                      self._padding,
                       mode=self._padding_mode,
                       data_format=self._data_format)
-            return F.conv2d(
-                x,
-                self.weight,
-                bias=self.bias,
-                stride=self._stride,
-                dilation=self._dilation,
-                groups=self._groups,
-                data_format=self._data_format)
+        else:
+            padding = self._padding
 
-        out = F.conv2d(
+        out = F.conv1d(
             x,
             self.weight,
             bias=self.bias,
-            padding=self._padding,
+            padding=padding,
             stride=self._stride,
             dilation=self._dilation,
             groups=self._groups,
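
The rewritten forward pre-pads with `F.pad` for the non-`zeros` modes and otherwise hands `self._padding` straight to `F.conv1d`. A hedged usage sketch of the refactored layer:

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.randn(2, 4, 8).astype("float32"))  # NCL
conv = paddle.nn.Conv1d(4, 6, 3, padding=1, padding_mode="replicate")
# Replicate-pad by 1 on each side, then convolve: length stays 8.
print(conv(x).shape)  # expected [2, 6, 8]
```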
@@ -462,7 +266,7 @@ class Conv2d(_ConvNd):
         return out
 
 
-class ConvTranspose1d(layers.Layer):
+class ConvTranspose1d(_ConvNd):
     """
     This interface is used to construct a callable object of the ``ConvTranspose1d`` class.
     For more details, refer to code examples.
@@ -603,34 +407,24 @@ class ConvTranspose1d(layers.Layer):
                  padding=0,
                  output_padding=0,
                  groups=1,
-                 bias=True,
                  dilation=1,
                  weight_attr=None,
                  bias_attr=None,
                  data_format="NCL"):
-        super(ConvTranspose1d, self).__init__()
-        assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d."
-        self._param_attr = weight_attr
-        self._bias_attr = bias_attr
-        self._groups = groups
-        self._in_channels = in_channels
-        self._out_channels = out_channels
-        self._output_padding = output_padding
-        self._data_format = data_format
-        self._bias = bias
-
-        self._stride = utils.convert_to_list(stride, 1, 'stride')
-        self._dilation = utils.convert_to_list(dilation, 1, 'dilation')
-        self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size')
-        self._padding = padding
-
-        filter_shape = [self._in_channels, out_channels // groups
-                        ] + self._kernel_size
-        self.weight = self.create_parameter(
-            shape=filter_shape, attr=self._param_attr)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr, shape=[self._out_channels],
-            is_bias=True) if self._bias else None
+        super(ConvTranspose1d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            1,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
 
     def forward(self, x, output_size=None):
         out = F.conv_transpose1d(
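
With `ConvTranspose1d` delegating to `_ConvNd`, the standalone `bias` flag is gone; the bias parameter is now controlled through `bias_attr` (passing `bias_attr=False` should omit it, following the usual Paddle convention). A usage sketch matching the constructor above and the `stride=3, output_padding=2` test case:

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.randn(2, 4, 8).astype("float32"))  # NCL
conv = paddle.nn.ConvTranspose1d(4, 6, 3, stride=3, output_padding=2)
# Expected length: (8 - 1)*3 + (3 - 1) + 2 + 1 = 26, i.e. shape [2, 6, 26].
print(conv(x).shape)
```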
@@ -638,7 +432,169 @@ class ConvTranspose1d(layers.Layer):
             self.weight,
             bias=self.bias,
             output_size=output_size,
-            output_padding=self._output_padding,
+            output_padding=self.output_padding,
+            padding=self._padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format)
+        return out
+
+
+class Conv2d(_ConvNd):
+    """
+    This interface is used to construct a callable object of the ``Conv2d`` class.
+    For more details, refer to code examples.
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
+    the feature map, H is the height of the feature map, and W is the width of the feature map.
+    Filter's shape is [MCHW] , where M is the number of output feature map,
+    C is the number of input feature map, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input feature map divided by the groups.
+    Please refer to UFLDL's `convolution
+    `_
+    for more details.
+    If bias attribute and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+    For each input :math:`X`, the equation is:
+
+    ..  math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a ``Tensor`` with NCHW format.
+    * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    
+    Parameters:
+        in_channels(int): The number of input channels in the input image.
+        out_channels(int): The number of output channels produced by the convolution.
+        kernel_size(int|list|tuple): The size of the convolving kernel.
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. The default value is 1.
+        padding(int|str|tuple|list, optional): The padding size. Padding could be in one of the following forms.
+            1. a string in ['valid', 'same'].
+            2. an int, which means each spatial dimension(height, width) is zero padded by size of `padding`
+            3. a list[int] or tuple[int] whose length is the number of spatial dimensions, which contains the amount of padding on each side for each spatial dimension. It has the form [pad_d1, pad_d2, ...].
+            4. a list[int] or tuple[int] whose length is 2 * number of spatial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spatial dimensions.
+            5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers corresponds to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
+            The default value is 0.
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. The default value is 1.
+        groups(int, optional): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. The default value is 1.
+        padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``.
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as weight_attr. If it is set to None, the parameter
+            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
+            :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. The default value is None.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It can be "NCHW" or "NHWC". Default: "NCHW".
+
+    Attribute:
+
+        **weight** (Parameter): the learnable weights of filter of this layer.
+
+        **bias** (Parameter or None): the learnable bias of this layer.
+
+    Shape:
+
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        ..  math::
+
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1
+
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1
+
+    Examples:
+
+        .. code-block:: python
+
+          import numpy as np
+          import paddle
+          import paddle.nn as nn
+          x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32')
+          
+          paddle.disable_static()
+          x_var = paddle.to_tensor(x)
+          conv = nn.Conv2d(4, 6, (3, 3))
+          y_var = conv(x_var)
+          y_np = y_var.numpy()
+          print(y_np.shape)
+          
+          # (2, 6, 6, 6)
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format="NCHW"):
+        super(Conv2d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            2,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+
+    def forward(self, x):
+        if self._padding_mode != 'zeros':
+            x = F.pad(x,
+                      self._reversed_padding_repeated_twice,
+                      mode=self._padding_mode,
+                      data_format=self._data_format)
+            return F.conv2d(
+                x,
+                self.weight,
+                bias=self.bias,
+                stride=self._stride,
+                dilation=self._dilation,
+                groups=self._groups,
+                data_format=self._data_format)
+
+        out = F.conv2d(
+            x,
+            self.weight,
+            bias=self.bias,
             padding=self._padding,
             stride=self._stride,
             dilation=self._dilation,
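
The padding forms enumerated in the docstring above map to concrete shapes; a hedged sanity check (expected sizes follow the `H_out`/`W_out` formulas in the Shape section):

```python
import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.randn(2, 4, 8, 8).astype("float32"))  # NCHW
pads = ["same",                                # string form
        1,                                     # one int for both H and W
        [1, 2],                                # per-dimension [pad_h, pad_w]
        [1, 1, 2, 2],                          # [top, bottom, left, right]
        [[0, 0], [0, 0], [1, 1], [2, 2]]]      # explicit per-axis pairs
for pad in pads:
    conv = paddle.nn.Conv2d(4, 6, 3, padding=pad)
    print(pad, conv(x).shape)
```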
@@ -920,8 +876,8 @@ class Conv3d(_ConvNd):
                  in_channels,
                  out_channels,
                  kernel_size,
-                 padding=0,
                  stride=1,
+                 padding=0,
                  dilation=1,
                  groups=1,
                  padding_mode='zeros',