docs(mge/module): add note about weight shape in conv

GitOrigin-RevId: 43e1f15968b7ecce70ff813c03c803d0d078c471

docs(mge/module): add note about weight shape in conv
GitOrigin-RevId: 43e1f15968b7ecce70ff813c03c803d0d078c471
7b68bf77 · Megvii Engine Team · 9d439ae6 · 7b68bf77 · 7b68bf77 · 7b68bf77
3 changed files
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -122,7 +122,7 @@ def conv1d(
    Refer to :class:`~.Conv1d` for more information.
    :param inp: The feature map of the convolution operation
-    :param weight: The convolution kernel
+    :param weight: The convolution kernel.
    :param bias: The bias added to the result of convolution (if given)
    :param stride: Stride of the 1D convolution operation. Default: 1
    :param padding: Size of the paddings added to the input on both sides of its
@@ -132,7 +132,7 @@ def conv1d(
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and the shape of weight should be ``(groups, out_channels // groups,
-        in_channels // groups, height, width)``.
+        in_channels // groups, kernel_size)``. Default: 1
    :type conv_mode: string or :class:`mgb.opr_param_defs.Convolution.Mode`
    :param conv_mode: Supports 'cross_correlation'. Default:
        'cross_correlation'.
@@ -209,8 +209,8 @@ def conv2d(
    :param groups: number of groups into which the input and output channels are divided, 
        so as to perform a ``grouped convolution``. When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
-        and the shape of weight should be `(groups, out_channel // groups,
+        and the shape of weight should be ``(groups, out_channels // groups,
-        in_channels // groups, height, width)`.
+        in_channels // groups, height, width)``. Default: 1
    :type conv_mode: string or :class:`Convolution.Mode`
    :param conv_mode: supports "cross_correlation". Default:
        "cross_correlation"
@@ -277,8 +277,8 @@ def conv3d(
    :param groups: number of groups into which the input and output channels are divided,
        so as to perform a ``grouped convolution``. When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
-        and the shape of weight should be `(groups, out_channel // groups,
+        and the shape of weight should be ``(groups, out_channels // groups,
-        in_channels // groups, t, height, width)`.
+        in_channels // groups, depth, height, width)``. Default: 1
    :param conv_mode: supports "cross_correlation". Default:
        "cross_correlation"
    :return: output tensor.
@@ -339,8 +339,8 @@ def conv_transpose2d(
    :param groups: number of groups into which the input and output channels are divided, 
        so as to perform a ``grouped convolution``. When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by groups,
-        and the shape of weight should be `(groups, out_channel // groups,
+        and the shape of weight should be ``(groups, in_channels // groups,
-        in_channels // groups, height, width)`. Default: 1
+        out_channels // groups, height, width)``. Default: 1
    :type conv_mode: string or :class:`Convolution.Mode`
    :param conv_mode: supports "cross_correlation". Default:
        "cross_correlation"
@@ -409,8 +409,8 @@ def deformable_conv2d(
    :param groups: number of groups into which the input and output channels are divided, 
        so as to perform a ``grouped convolution``. When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by groups,
-        and the shape of weight should be `(groups, out_channel // groups,
+        and the shape of weight should be ``(groups, out_channels // groups,
-        in_channels // groups, height, width)`. Default: 1
+        in_channels // groups, height, width)``. Default: 1
    :type conv_mode: string or :class:`Convolution.Mode`
    :param conv_mode: supports "cross_correlation". Default:
        "cross_correlation"
@@ -498,13 +498,14 @@ def conv_transpose3d(
    dilation: Union[int, Tuple[int, int, int]] = 1,
 ) -> Tensor:
    """
-    3D transposed convolution operation. Only support the case that group = 1 
+    3D transposed convolution operation. Only support the case that groups = 1 
    and conv_mode = "cross_correlation".
    Refer to :class:`~.ConvTranspose3d` for more information.
    :param inp: feature map of the convolution operation.
    :param weight: convolution kernel.
+        weight usually has shape ``(in_channels, out_channels, depth, height, width)``.
    :param bias: bias added to the result of convolution (if given).
    :param stride: stride of the 3D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on all sides of its

--- a/imperative/python/megengine/module/conv.py
+++ b/imperative/python/megengine/module/conv.py
@@ -113,7 +113,6 @@ class Conv1d(_ConvNd):
    :math:`N` is batch size, :math:`C` denotes number of channels, and
    :math:`H` is length of 1D data element.
    When `groups == in_channels` and `out_channels == K * in_channels`,
    where K is a positive integer, this operation is also known as depthwise
    convolution.
@@ -124,10 +123,8 @@ class Conv1d(_ConvNd):
    :param in_channels: number of input channels.
    :param out_channels: number of output channels.
-    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
+    :param kernel_size: size of weight on spatial dimensions.
-        an :class:`int`, the actual kernel size would be
+    :param stride: stride of the 1D convolution operation.
-        `(kernel_size, kernel_size)`. Default: 1
-    :param stride: stride of the 1D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
    :param dilation: dilation of the 1D convolution operation. Default: 1
@@ -135,8 +132,7 @@ class Conv1d(_ConvNd):
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
+        shape. Default: 1
-        out_channel // groups, in_channels // groups, *kernel_size)`.
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
    :param conv_mode: Supports `cross_correlation`. Default:
@@ -146,6 +142,12 @@ class Conv1d(_ConvNd):
        "float32" would be used for accumulator and intermediate result, but only
        effective when input and output are of float16 dtype.
+    .. note::
+        :attr:`weight` usually has shape ``(out_channels, in_channels, kernel_size)``,
+            if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)``
+        :attr:`bias` usually has shape ``(1, out_channels, 1)``
    Examples:
    .. testcode::
@@ -215,7 +217,7 @@ class Conv1d(_ConvNd):
        assert (
            ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCH(W=1)
@@ -286,7 +288,7 @@ class Conv2d(_ConvNd):
    :param out_channels: number of output channels.
    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
        an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
    :param stride: stride of the 2D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
@@ -295,8 +297,7 @@ class Conv2d(_ConvNd):
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
+        shape. Default: 1
-        out_channel // groups, in_channels // groups, *kernel_size)`.
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
    :param conv_mode: Supports `cross_correlation`. Default:
@@ -306,6 +307,12 @@ class Conv2d(_ConvNd):
        "float32" would be used for accumulator and intermediate result, but only
        effective when input and output are of float16 dtype.
+    .. note::
+        :attr:`weight` usually has shape ``(out_channels, in_channels, height, width)``,
+            if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    Examples:
    .. testcode::
@@ -375,7 +382,7 @@ class Conv2d(_ConvNd):
        assert (
            ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCHW
@@ -417,8 +424,7 @@ class Conv3d(_ConvNd):
        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)
    where :math:`\star` is the valid 3D cross-correlation operator,
-    :math:`N` is batch size, :math:`C` denotes number of channels
+    :math:`N` is batch size, :math:`C` denotes number of channels.
    When `groups == in_channels` and `out_channels == K * in_channels`,
    where K is a positive integer, this operation is also known as depthwise
@@ -432,7 +438,7 @@ class Conv3d(_ConvNd):
    :param out_channels: number of output channels.
    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
        an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size, kernel_size)``.
    :param stride: stride of the 3D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
@@ -441,13 +447,18 @@ class Conv3d(_ConvNd):
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
+        shape. Default: 1
-        out_channel // groups, in_channels // groups, *kernel_size)`.
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
    :param conv_mode: Supports `cross_correlation`. Default:
        `cross_correlation`
+    .. note::
+        :attr:`weight` usually has shape ``(out_channels, in_channels, depth, height, width)``,
+            if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    Examples:
    .. testcode::
@@ -513,7 +524,7 @@ class Conv3d(_ConvNd):
        assert (
            ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCTHW
@@ -555,7 +566,7 @@ class ConvTranspose2d(_ConvNd):
    :param out_channels: number of output channels.
    :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
        an :class:`int`, the actual kernel size would be
-        ``(kernel_size, kernel_size)``. Default: 1
+        ``(kernel_size, kernel_size)``.
    :param stride: stride of the 2D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
@@ -564,8 +575,7 @@ class ConvTranspose2d(_ConvNd):
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be ``(groups,
+        shape. Default: 1
-        out_channels // groups, in_channels // groups, *kernel_size)``. Default: 1
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
    :param conv_mode: Supports `cross_correlation`. Default:
@@ -574,6 +584,12 @@ class ConvTranspose2d(_ConvNd):
        placed on the precision of intermediate results. When set to "float32",
        "float32" would be used for accumulator and intermediate result, but only
        effective when input and output are of float16 dtype.
+    .. note::
+        :attr:`weight` usually has shape ``(in_channels, out_channels, height, width)``,
+            if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    """
    def __init__(
@@ -624,7 +640,7 @@ class ConvTranspose2d(_ConvNd):
        assert (
            ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCHW
@@ -659,15 +675,19 @@ class LocalConv2d(Conv2d):
    :param input_width: the width of the input images.
    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
        an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
    :param stride: stride of the 2D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
    :param groups: number of groups into which the input and output channels are divided,
        so as to perform a "grouped convolution". When ``groups`` is not 1,
-        ``in_channels`` and ``out_channels`` must be divisible by ``groups``.
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1
-        The shape of weight is `(groups, output_height, output_width,
-        in_channels // groups, *kernel_size, out_channels // groups)`.
+    .. note::
+        :attr:`weight` usually has shape ``(out_height, out_width, in_channels, height, width, out_channels)``,
+            if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    """
    def __init__(
@@ -700,17 +720,17 @@ class LocalConv2d(Conv2d):
    def _infer_weight_shape(self):
        group = self.groups
-        output_height = (
+        out_height = (
            self.input_height + self.padding[0] * 2 - self.kernel_size[0]
        ) // self.stride[0] + 1
-        output_width = (
+        out_width = (
            self.input_width + self.padding[1] * 2 - self.kernel_size[1]
        ) // self.stride[1] + 1
        # Assume format is NCHW
        return (
            group,
-            output_height,
+            out_height,
-            output_width,
+            out_width,
            self.in_channels // group,
            self.kernel_size[0],
            self.kernel_size[1],
@@ -747,7 +767,7 @@ class DeformableConv2d(_ConvNd):
    :param out_channels: number of output channels.
    :param kernel_size: size of weight on spatial dimensions. If kernel_size is
        an :class:`int`, the actual kernel size would be
-        `(kernel_size, kernel_size)`. Default: 1
+        ``(kernel_size, kernel_size)``.
    :param stride: stride of the 2D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on both sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
@@ -756,8 +776,7 @@ class DeformableConv2d(_ConvNd):
        so as to perform a "grouped convolution". When ``groups`` is not 1,
        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
        and there would be an extra dimension at the beginning of the weight's
-        shape. Specifically, the shape of weight would be `(groups,
+        shape. Default: 1
-        out_channel // groups, in_channels // groups, *kernel_size)`.
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
    :param conv_mode: Supports `cross_correlation`. Default:
@@ -766,6 +785,13 @@ class DeformableConv2d(_ConvNd):
        placed on the precision of intermediate results. When set to "float32",
        "float32" would be used for accumulator and intermediate result, but only
        effective when input and output are of float16 dtype.
+    .. note::
+        :attr:`weight` usually has shape ``(out_channels, in_channels, height, width)``,
+            if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    """
    def __init__(
@@ -816,7 +842,7 @@ class DeformableConv2d(_ConvNd):
        assert (
            ichl % group == 0 and ochl % group == 0
-        ), "invalid config: input_channels={} output_channels={} group={}".format(
+        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCHW
@@ -849,7 +875,7 @@ class ConvTranspose3d(_ConvNd):
    r"""
    Applies a 3D transposed convolution over an input tensor.
-    Only support the case that group = 1 and conv_mode = "cross_correlation".
+    Only support the case that groups = 1 and conv_mode = "cross_correlation".
    :class:`ConvTranspose3d` can be seen as the gradient of :class:`Conv3d` operation
    with respect to its input.
@@ -862,13 +888,18 @@ class ConvTranspose3d(_ConvNd):
    :param out_channels: number of output channels.
    :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
        an :class:`int`, the actual kernel size would be
-        ``(kernel_size, kernel_size, kernel_size)``. Default: 1
+        ``(kernel_size, kernel_size, kernel_size)``.
    :param stride: stride of the 3D convolution operation. Default: 1
    :param padding: size of the paddings added to the input on all sides of its
        spatial dimensions. Only zero-padding is supported. Default: 0
    :param dilation: dilation of the 3D convolution operation. Default: 1
    :param bias: whether to add a bias onto the result of convolution. Default:
        True
+    .. note::
+        :attr:`weight` usually has shape ``(in_channels, out_channels, depth, height, width)``.
+        :attr:`bias` usually has shape ``(1, out_channels, *1)``
    """
    def __init__(

--- a/imperative/python/test/unit/module/test_conv.py
+++ b/imperative/python/test/unit/module/test_conv.py
@@ -80,18 +80,7 @@ def test_local_conv2d():
        ).astype(np.float32)
        output_height = (input_height + padding * 2 - kernel_size) // stride + 1
        output_width = (input_width + padding * 2 - kernel_size) // stride + 1
-        weights = np.random.normal(
+        weights = local_conv2d.weight.numpy()
-            size=(
-                groups,
-                output_height,
-                output_width,
-                in_channels // groups,
-                kernel_size,
-                kernel_size,
-                out_channels // groups,
-            )
-        ).astype(np.float32)
-        local_conv2d.weight = Parameter(weights)
        outputs = local_conv2d(tensor(inputs))
        # naive calculation use numpy
        # only test output_height == input_height, output_width == input_width