diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
index 9eb34758fec4a35a25e5b6811d5603eea7015535..23106cb076d28f33044d09bdda710cd4eaf1a9b1 100644
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -122,7 +122,7 @@ def conv1d(
     Refer to :class:`~.Conv1d` for more information.
 
     :param inp: The feature map of the convolution operation
-    :param weight: The convolution kernel
+    :param weight: The convolution kernel.
     :param bias: The bias added to the result of convolution (if given)
     :param stride: Stride of the 1D convolution operation. Default: 1
     :param padding: Size of the paddings added to the input on both sides of its
@@ -132,7 +132,7 @@ def conv1d(
         so as to perform a "grouped convolution". When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and the shape of weight should be ``(groups, out_channel // groups,
-        in_channels // groups, height, width)``.
+        in_channels // groups, kernel_size)``.
         Default: 1
     :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode`
     :param conv_mode: Supports 'cross_correlation'. Default: 'cross_correlation'.
@@ -209,8 +209,8 @@ def conv2d(
     :param groups: number of groups into which the input and output channels are divided,
         so as to perform a ``grouped convolution``. When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
-        and the shape of weight should be `(groups, out_channel // groups,
-        in_channels // groups, height, width)`.
+        and the shape of weight should be ``(groups, out_channel // groups,
+        in_channels // groups, height, width)``.
         Default: 1
     :type conv_mode: string or :class:`Convolution.Mode`
     :param conv_mode: supports "cross_correlation". Default: "cross_correlation"
@@ -277,8 +277,8 @@ def conv3d(
     :param groups: number of groups into which the input and output channels are divided,
         so as to perform a ``grouped convolution``. When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
-        and the shape of weight should be `(groups, out_channel // groups,
-        in_channels // groups, t, height, width)`.
+        and the shape of weight should be ``(groups, out_channel // groups,
+        in_channels // groups, depth, height, width)``.
         Default: 1
     :param conv_mode: supports "cross_correlation". Default: "cross_correlation"
     :return: output tensor.
@@ -339,8 +339,8 @@ def conv_transpose2d(
     :param groups: number of groups into which the input and output channels are divided,
         so as to perform a ``grouped convolution``. When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by groups,
-        and the shape of weight should be `(groups, out_channel // groups,
-        in_channels // groups, height, width)`. Default: 1
+        and the shape of weight should be ``(groups, in_channels // groups,
+        out_channels // groups, height, width)``. Default: 1
 
     :type conv_mode: string or :class:`Convolution.Mode`
     :param conv_mode: supports "cross_correlation". Default: "cross_correlation"
@@ -409,8 +409,8 @@ def deformable_conv2d(
     :param groups: number of groups into which the input and output channels are divided,
         so as to perform a ``grouped convolution``. When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by groups,
-        and the shape of weight should be `(groups, out_channel // groups,
-        in_channels // groups, height, width)`. Default: 1
+        and the shape of weight should be ``(groups, out_channel // groups,
+        in_channels // groups, height, width)``. Default: 1
 
     :type conv_mode: string or :class:`Convolution.Mode`
     :param conv_mode: supports "cross_correlation". Default: "cross_correlation"
@@ -498,13 +498,14 @@ def conv_transpose3d(
     dilation: Union[int, Tuple[int, int, int]] = 1,
 ) -> Tensor:
     """
-    3D transposed convolution operation. Only support the case that group = 1
+    3D transposed convolution operation. Only support the case that groups = 1
     and conv_mode = "cross_correlation".
 
     Refer to :class:`~.ConvTranspose3d` for more information.
 
     :param inp: feature map of the convolution operation.
     :param weight: convolution kernel.
+        weight usually has shape ``(in_channels, out_channels, depth, height, width)``.
     :param bias: bias added to the result of convolution (if given).
     :param stride: stride of the 3D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on all sides of its
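The weight layouts that the revised ``conv2d`` / ``conv_transpose2d`` docstrings describe can be exercised directly from the functional API. A minimal sketch, not part of the patch, with arbitrary tensor sizes; the printed shapes are what the usual convolution arithmetic predicts::

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    inp = mge.tensor(np.random.randn(2, 4, 8, 8).astype("float32"))

    # grouped convolution: weight is (groups, out_channels // groups, in_channels // groups, kh, kw)
    w_grouped = mge.tensor(np.random.randn(2, 3, 2, 3, 3).astype("float32"))
    out = F.conv2d(inp, w_grouped, stride=1, padding=1, groups=2)
    print(out.shape)    # expected: (2, 6, 8, 8)

    # transposed convolution with groups == 1: weight is (in_channels, out_channels, kh, kw)
    w_deconv = mge.tensor(np.random.randn(4, 6, 3, 3).astype("float32"))
    out_t = F.conv_transpose2d(inp, w_deconv, stride=2)
    print(out_t.shape)  # expected: (2, 6, 17, 17)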
diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py
index b7c6007d2eaddb75a2a7211fd1c2eb055080c43a..aef97d858a0dc0178132fa0fd476fc0be30618f4 100644
--- a/imperative/python/megengine/module/conv.py
+++ b/imperative/python/megengine/module/conv.py
@@ -113,7 +113,6 @@ class Conv1d(_ConvNd):
     :math:`N` is batch size, :math:`C` denotes number of channels, and
     :math:`H` is length of 1D data element.
 
-
     When `groups == in_channels` and `out_channels == K * in_channels`, where K is
     a positive integer, this operation is also known as depthwise
     convolution.
@@ -124,10 +123,8 @@ class Conv1d(_ConvNd):
 
     :param in_channels: number of input channels.
     :param out_channels: number of output channels.
-    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
-        an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
-    :param stride: stride of the 1D convolution operation. Default: 1
+    :param kernel_size: size of weight on spatial dimensions.
+    :param stride: stride of the 1D convolution operation.
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
     :param dilation: dilation of the 1D convolution operation. Default: 1
@@ -135,8 +132,7 @@ class Conv1d(_ConvNd):
         so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
-        out_channel // groups, in_channels // groups, *kernel_size)`.
+        shape.
         Default: 1
     :param bias: whether to add a bias onto the result of convolution. Default: True
     :param conv_mode: Supports `cross_correlation`. Default:
@@ -146,6 +142,12 @@ class Conv1d(_ConvNd):
         "float32" would be used for accumulator and intermediate result,
         but only effective when input and output are of float16 dtype.
 
+    .. note::
+
+        :attr:`weight` usually has shape ``(out_channels, in_channels, kernel_size)``,
+        if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)``
+        :attr:`bias` usually has shape ``(1, out_channels, 1)``
+
     Examples:
 
     .. testcode::
@@ -215,7 +217,7 @@ class Conv1d(_ConvNd):
 
         assert (
             ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
             ichl, ochl, group
         )
         # Assume format is NCH(W=1)
@@ -286,7 +288,7 @@ class Conv2d(_ConvNd):
     :param out_channels: number of output channels.
     :param kernel_size: size of weight on spatial dimensions. If kernel_size is
         an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
     :param stride: stride of the 2D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
@@ -295,8 +297,7 @@ class Conv2d(_ConvNd):
         so as to perform a "grouped convolution". When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
-        out_channel // groups, in_channels // groups, *kernel_size)`.
+        shape.
         Default: 1
     :param bias: whether to add a bias onto the result of convolution. Default: True
     :param conv_mode: Supports `cross_correlation`. Default:
@@ -306,6 +307,12 @@ class Conv2d(_ConvNd):
         "float32" would be used for accumulator and intermediate result,
         but only effective when input and output are of float16 dtype.
 
+    .. note::
+
+        :attr:`weight` usually has shape ``(out_channels, in_channels, height, width)``,
+        if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
+
     Examples:
 
     .. testcode::
@@ -375,7 +382,7 @@ class Conv2d(_ConvNd):
 
         assert (
             ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
             ichl, ochl, group
         )
         # Assume format is NCHW
@@ -417,8 +424,7 @@ class Conv3d(_ConvNd):
         \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)
 
     where :math:`\star` is the valid 3D cross-correlation operator,
-    :math:`N` is batch size, :math:`C` denotes number of channels
-
+    :math:`N` is batch size, :math:`C` denotes number of channels.
     When `groups == in_channels` and `out_channels == K * in_channels`, where K is
     a positive integer, this operation is also known as depthwise
     convolution.
@@ -432,7 +438,7 @@ class Conv3d(_ConvNd):
     :param out_channels: number of output channels.
     :param kernel_size: size of weight on spatial dimensions. If kernel_size is
         an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size, kernel_size)`. Default: 1
+        `(kernel_size, kernel_size, kernel_size)`.
     :param stride: stride of the 3D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
@@ -441,13 +447,18 @@ class Conv3d(_ConvNd):
         so as to perform a "grouped convolution". When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
-        out_channel // groups, in_channels // groups, *kernel_size)`.
+        shape.
         Default: 1
     :param bias: whether to add a bias onto the result of convolution. Default: True
     :param conv_mode: Supports `cross_correlation`. Default:
         `cross_correlation`
 
+    .. note::
+
+        :attr:`weight` usually has shape ``(out_channels, in_channels, depth, height, width)``,
+        if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
+
     Examples:
 
     .. testcode::
@@ -513,7 +524,7 @@ class Conv3d(_ConvNd):
 
         assert (
             ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
             ichl, ochl, group
         )
         # Assume format is NCTHW
@@ -555,7 +566,7 @@ class ConvTranspose2d(_ConvNd):
     :param out_channels: number of output channels.
     :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
         an :class:`int`, the actual kernel size would be
-        ``(kernel_size, kernel_size)``. Default: 1
+        ``(kernel_size, kernel_size)``.
     :param stride: stride of the 2D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
@@ -564,8 +575,7 @@ class ConvTranspose2d(_ConvNd):
         so as to perform a "grouped convolution". When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be ``(groups,
-        out_channels // groups, in_channels // groups, *kernel_size)``. Default: 1
+        shape. Default: 1
     :param bias: wether to add a bias onto the result of convolution.
         Default: True
     :param conv_mode: Supports `cross_correlation`. Default:
@@ -574,6 +584,12 @@ class ConvTranspose2d(_ConvNd):
         placed on the precision of intermediate results. When set to "float32",
         "float32" would be used for accumulator and intermediate result,
         but only effective when input and output are of float16 dtype.
+
+    .. note::
+
+        :attr:`weight` usually has shape ``(in_channels, out_channels, height, width)``,
+        if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
     """
 
     def __init__(
@@ -624,7 +640,7 @@ class ConvTranspose2d(_ConvNd):
 
         assert (
             ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
             ichl, ochl, group
         )
         # Assume format is NCHW
@@ -659,15 +675,19 @@ class LocalConv2d(Conv2d):
     :param input_width: the width of the input images.
     :param kernel_size: size of weight on spatial dimensions. If kernel_size is
         an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
     :param stride: stride of the 2D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
     :param groups: number of groups into which the input and output channels are divided,
         so as to perform a "grouped convolution". When ``groups`` is not 1,
-        ``in_channels`` and ``out_channels`` must be divisible by ``groups``.
-        The shape of weight is `(groups, output_height, output_width,
-        in_channels // groups, *kernel_size, out_channels // groups)`.
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1
+
+    .. note::
+
+        :attr:`weight` usually has shape ``(out_height, out_width, in_channels, height, width, out_channels)``,
+        if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
     """
 
     def __init__(
@@ -700,17 +720,17 @@ def _infer_weight_shape(self):
         group = self.groups
-        output_height = (
+        out_height = (
             self.input_height + self.padding[0] * 2 - self.kernel_size[0]
         ) // self.stride[0] + 1
-        output_width = (
+        out_width = (
             self.input_width + self.padding[1] * 2 - self.kernel_size[1]
         ) // self.stride[1] + 1
         # Assume format is NCHW
         return (
             group,
-            output_height,
-            output_width,
+            out_height,
+            out_width,
             self.in_channels // group,
             self.kernel_size[0],
             self.kernel_size[1],
@@ -747,7 +767,7 @@ class DeformableConv2d(_ConvNd):
     :param out_channels: number of output channels.
     :param kernel_size: size of weight on spatial dimensions. If kernel_size is
         an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
     :param stride: stride of the 2D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on both sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
@@ -756,8 +776,7 @@ class DeformableConv2d(_ConvNd):
         so as to perform a "grouped convolution". When ``groups`` is not 1,
         ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
         and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
-        out_channel // groups, in_channels // groups, *kernel_size)`.
+        shape.
         Default: 1
     :param bias: whether to add a bias onto the result of convolution. Default: True
     :param conv_mode: Supports `cross_correlation`. Default:
@@ -766,6 +785,13 @@ class DeformableConv2d(_ConvNd):
         placed on the precision of intermediate results. When set to "float32",
         "float32" would be used for accumulator and intermediate result,
         but only effective when input and output are of float16 dtype.
+
+    .. note::
+
+        :attr:`weight` usually has shape ``(out_channels, in_channels, height, width)``,
+        if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
+
     """
 
     def __init__(
@@ -816,7 +842,7 @@ class DeformableConv2d(_ConvNd):
 
         assert (
             ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
             ichl, ochl, group
         )
         # Assume format is NCHW
@@ -849,7 +875,7 @@ class ConvTranspose3d(_ConvNd):
     r"""
     Applies a 3D transposed convolution over an input tensor.
 
-    Only support the case that group = 1 and conv_mode = "cross_correlation".
+    Only support the case that groups = 1 and conv_mode = "cross_correlation".
 
    :class:`ConvTranspose3d` can be seen as the gradient of :class:`Conv3d` operation
     with respect to its input.
@@ -862,13 +888,18 @@ class ConvTranspose3d(_ConvNd):
     :param out_channels: number of output channels.
     :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
         an :class:`int`, the actual kernel size would be
-        ``(kernel_size, kernel_size, kernel_size)``. Default: 1
+        ``(kernel_size, kernel_size, kernel_size)``.
     :param stride: stride of the 3D convolution operation. Default: 1
     :param padding: size of the paddings added to the input on all sides of its
         spatial dimensions. Only zero-padding is supported. Default: 0
     :param dilation: dilation of the 3D convolution operation. Default: 1
     :param bias: wether to add a bias onto the result of convolution. Default: True
+
+    .. note::
+
+        :attr:`weight` usually has shape ``(in_channels, out_channels, depth, height, width)``.
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
     """
 
     def __init__(
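The parameter shapes spelled out in the new ``.. note::`` blocks can be checked against the modules themselves. A minimal sketch, not part of the patch, with arbitrary channel counts::

    import megengine.module as M

    # groups == 1: weight is (out_channels, in_channels, kh, kw); bias broadcasts as (1, out_channels, 1, 1)
    m = M.Conv2d(in_channels=4, out_channels=8, kernel_size=3)
    print(m.weight.shape, m.bias.shape)  # expected: (8, 4, 3, 3) (1, 8, 1, 1)

    # groups != 1 adds a leading group axis: (groups, out_channels // groups, in_channels // groups, kh, kw)
    g = M.Conv2d(in_channels=4, out_channels=8, kernel_size=3, groups=2)
    print(g.weight.shape)                # expected: (2, 4, 2, 3, 3)

    # transposed convolution swaps the channel axes: (in_channels, out_channels, kh, kw) when groups == 1
    t = M.ConvTranspose2d(in_channels=4, out_channels=8, kernel_size=3)
    print(t.weight.shape)                # expected: (4, 8, 3, 3)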
diff --git a/imperative/python/test/unit/module/test_conv.py b/imperative/python/test/unit/module/test_conv.py
index 0f99608c3925f23d7d7dd8b41a59ffc24eaded27..b36c2496b2746c257037051a9ff5a4dbc95ddea1 100644
--- a/imperative/python/test/unit/module/test_conv.py
+++ b/imperative/python/test/unit/module/test_conv.py
@@ -80,18 +80,7 @@ def test_local_conv2d():
     ).astype(np.float32)
     output_height = (input_height + padding * 2 - kernel_size) // stride + 1
     output_width = (input_width + padding * 2 - kernel_size) // stride + 1
-    weights = np.random.normal(
-        size=(
-            groups,
-            output_height,
-            output_width,
-            in_channels // groups,
-            kernel_size,
-            kernel_size,
-            out_channels // groups,
-        )
-    ).astype(np.float32)
-    local_conv2d.weight = Parameter(weights)
+    weights = local_conv2d.weight.numpy()
     outputs = local_conv2d(tensor(inputs))
     # naive calculation use numpy
     # only test output_height == input_height, output_width == input_width
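The test change above relies on ``LocalConv2d`` already owning a correctly shaped weight, which follows the ``_infer_weight_shape`` logic in the module diff. A sketch of that arithmetic, not part of the patch; the sizes are made up, and the constructor is assumed to take its arguments in the order the docstring lists them::

    import megengine.module as M

    in_channels, out_channels, groups = 4, 8, 2
    input_height = input_width = 10
    kernel_size, stride, padding = 3, 1, 1

    local = M.LocalConv2d(
        in_channels, out_channels, input_height, input_width,
        kernel_size, stride=stride, padding=padding, groups=groups,
    )

    out_height = (input_height + 2 * padding - kernel_size) // stride + 1
    out_width = (input_width + 2 * padding - kernel_size) // stride + 1
    # documented layout: (groups, out_height, out_width, in_channels // groups, kh, kw, out_channels // groups)
    print(local.weight.shape)  # expected: (2, 10, 10, 2, 3, 3, 4)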