Commit cd02d7c8 authored by: Megvii Engine Team

feat(mge/opr): support deformable conv2d/psroi_pooling

GitOrigin-RevId: 501cadda76b84929af51db134f4b74dd73e1c2a3
Parent 227d5b9f
......@@ -44,6 +44,8 @@ __all__ = [
"batch_norm",
"conv2d",
"conv_transpose2d",
"deformable_conv2d",
"deformable_psroi_pooling",
"dot",
"dropout",
"indexing_one_hot",
......@@ -119,7 +121,8 @@ def conv2d(
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1,
:param groups: number of groups into which the input and output channels are divided,
so as to perform a ``grouped convolution``. When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
......@@ -141,7 +144,6 @@ def conv2d(
pad_h, pad_w = expand_hw(padding)
dilate_h, dilate_w = expand_hw(dilation)
Sparse = builtin.Convolution.Sparse
sparse_type = "DENSE" if groups == 1 else "GROUP"
op = builtin.Convolution(
stride_h=stride_h,
......@@ -185,7 +187,8 @@ def conv_transpose2d(
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided, so as to perform a ``grouped convolution``. When ``groups`` is not 1,
:param groups: number of groups into which the input and output channels are divided,
so as to perform a ``grouped convolution``. When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by groups,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`. Default: 1
......@@ -226,6 +229,74 @@ def conv_transpose2d(
return output
def deformable_conv2d(
inp: Tensor,
weight: Tensor,
offset: Tensor,
mask: Tensor,
bias: Optional[Tensor] = None,
stride: Union[int, Tuple[int, int]] = 1,
padding: Union[int, Tuple[int, int]] = 0,
dilation: Union[int, Tuple[int, int]] = 1,
groups: int = 1,
conv_mode="CROSS_CORRELATION",
compute_mode="DEFAULT",
) -> Tensor:
"""
Deformable Convolution.
:param inp: input feature map.
:param weight: convolution kernel.
:param offset: input offset to the kernel; the channel count of this tensor must match the deformable settings.
:param mask: input mask to the kernel; the channel count of this tensor must match the deformable settings.
:param bias: bias added to the result of convolution (if given).
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a ``grouped convolution``. When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by groups,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`. Default: 1
:type conv_mode: string or :class:`Convolution.Mode`
:param conv_mode: supports "CROSS_CORRELATION". Default:
"CROSS_CORRELATION"
:type compute_mode: string or
:class:`Convolution.ComputeMode`
:param compute_mode: when set to "DEFAULT", no special requirements will be
placed on the precision of intermediate results. When set to "FLOAT32",
"Float32" would be used for accumulator and intermediate result, but only
effective when input and output are of Float16 dtype.
:return: output tensor.
"""
assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION"
assert compute_mode == "DEFAULT" or compute_mode.name == "DEFAULT"
stride_h, stride_w = expand_hw(stride)
pad_h, pad_w = expand_hw(padding)
dilate_h, dilate_w = expand_hw(dilation)
sparse_type = "DENSE" if groups == 1 else "GROUP"
op = builtin.DeformableConv(
stride_h=stride_h,
stride_w=stride_w,
pad_h=pad_h,
pad_w=pad_w,
dilate_h=dilate_h,
dilate_w=dilate_w,
strategy=get_conv_execution_strategy(),
mode=conv_mode,
compute_mode=compute_mode,
sparse=sparse_type,
)
inp, weight, offset, mask = utils.convert_inputs(inp, weight, offset, mask)
(output,) = apply(op, inp, weight, offset, mask)
if bias is not None:
output += bias
return output
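# A minimal usage sketch (a hypothetical helper, not part of this module):
# assuming a single deformable group, ``offset`` carries 2*kh*kw channels
# (an x/y shift per kernel sample point) and ``mask`` carries kh*kw
# modulation scalars, both at the output spatial resolution; the concrete
# shapes below are illustrative only.
def _deformable_conv2d_example():
    import numpy as np

    inp = Tensor(np.random.randn(1, 3, 8, 8).astype("float32"))
    weight = Tensor(np.random.randn(5, 3, 3, 3).astype("float32"))
    # with a 3x3 kernel, stride 1 and padding 1, the output stays 8x8
    offset = Tensor(np.random.randn(1, 2 * 3 * 3, 8, 8).astype("float32"))
    mask = Tensor(np.random.uniform(size=(1, 3 * 3, 8, 8)).astype("float32"))
    # expected output shape: (1, 5, 8, 8)
    return deformable_conv2d(inp, weight, offset, mask, stride=1, padding=1)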
def local_conv2d(
inp: Tensor,
weight: Tensor,
......@@ -380,6 +451,45 @@ def adaptive_avg_pool2d(
return output
def deformable_psroi_pooling(
inp: Tensor,
rois: Tensor,
trans: Tensor,
no_trans: bool,
part_size: int,
pooled_h: int,
pooled_w: int,
sample_per_part: int,
spatial_scale: float,
trans_std: float = 0.1,
):
"""
Deformable PSROI (Position Sensitive Region of Interest) Pooling.
:param inp: input feature map.
:param rois: the rois for feature pooling.
:param trans: input offset to psroi_pooling.
:param no_trans: selects the phase of DeformablePSROIPooling. False for
the 1st phase, True for the 2nd phase.
:param part_size: part size.
:param pooled_h: height of the pooled feature map.
:param pooled_w: width of the pooled feature map.
:param sample_per_part: number of sample points in each part.
:param spatial_scale: the spatial scale w.r.t. the input image.
:param trans_std: multiplier used in the 2nd phase.
:return: output tensor.
"""
op = builtin.DeformablePSROIPooling(
no_trans=no_trans,
part_size=part_size,
pooled_h=pooled_h,
pooled_w=pooled_w,
sample_per_part=sample_per_part,
spatial_scale=spatial_scale,
trans_std=trans_std,
)
output, _ = apply(op, inp, rois, trans)
return output
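# A minimal usage sketch (a hypothetical helper, not part of this module),
# reusing the tensor shapes exercised by the unit test in this commit; the
# (24, 2, 7, 7) layout of ``trans`` follows that test rather than being
# derived from the number of rois.
def _deformable_psroi_pooling_example():
    import numpy as np

    inp = Tensor(np.random.random((1, 256, 64, 64)).astype("float32"))
    rois = Tensor(np.random.random((1, 5)).astype("float32"))
    trans = Tensor(np.random.random((24, 2, 7, 7)).astype("float32"))
    return deformable_psroi_pooling(
        inp, rois, trans,
        no_trans=False, part_size=7, pooled_h=7, pooled_w=7,
        sample_per_part=4, spatial_scale=1.0 / 64, trans_std=0.1,
    )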
def prelu(inp: Tensor, weight: Tensor) -> Tensor:
r"""
Applies the element-wise PReLU function.
......
......@@ -12,8 +12,16 @@ from .adaptive_pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d
from .batch_matmul_activation import BatchMatMulActivation
from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm
from .concat import Concat
from .conv import Conv1d, Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d
from .conv import (
Conv1d,
Conv2d,
ConvRelu2d,
ConvTranspose2d,
DeformableConv2d,
LocalConv2d,
)
from .conv_bn import ConvBn2d, ConvBnRelu2d
from .deformable_psroi_pooling import DeformablePSROIPooling
from .dropout import Dropout
from .elemwise import Elemwise
from .embedding import Embedding
......
......@@ -10,7 +10,14 @@ from typing import Tuple, Union
import numpy as np
from ..functional import conv1d, conv2d, conv_transpose2d, local_conv2d, relu
from ..functional import (
conv1d,
conv2d,
conv_transpose2d,
deformable_conv2d,
local_conv2d,
relu,
)
from ..tensor import Parameter
from ..utils.tuple_function import _pair, _pair_nonzero
from . import init
......@@ -121,7 +128,8 @@ class Conv1d(_ConvNd):
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 1D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1,
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Specifically, the shape of weight would be `(groups,
......@@ -250,15 +258,16 @@ class Conv2d(_ConvNd):
In general, output feature maps' shapes can be inferred as follows:
input: :math:`(N, C_{\text{in}}, H_{\text{in}}, W_{\text{in}})`
output: :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` where
.. math::
\text{H}_{out} = \lfloor \frac{\text{H}_{in} + 2 * \text{padding[0]} -
\text{dilation[0]} * (\text{kernel_size[0]} - 1)}{\text{stride[0]}} + 1 \rfloor
\text{dilation[0]} * (\text{kernel_size[0]} - 1) - 1}{\text{stride[0]}} + 1 \rfloor
.. math::
\text{W}_{out} = \lfloor \frac{\text{W}_{in} + 2 * \text{padding[1]} -
\text{dilation[1]} * (\text{kernel_size[1]} - 1)}{\text{stride[1]}} + 1 \rfloor
\text{dilation[1]} * (\text{kernel_size[1]} - 1) - 1}{\text{stride[1]}} + 1 \rfloor
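For example, with :math:`H_{in} = 64`, ``padding[0] = 1``,
``dilation[0] = 1``, ``kernel_size[0] = 3`` and ``stride[0] = 2``,
:math:`H_{out} = \lfloor (64 + 2 - 2 - 1) / 2 + 1 \rfloor = 32`.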
When `groups == in_channels` and `out_channels == K * in_channels`,
where K is a positive integer, this operation is also known as depthwise
......@@ -277,7 +286,8 @@ class Conv2d(_ConvNd):
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1,
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Specifically, the shape of weight would be `(groups,
......@@ -406,7 +416,8 @@ class ConvTranspose2d(_ConvNd):
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1,
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Specifically, the shape of weight would be ``(groups,
......@@ -579,3 +590,107 @@ class ConvRelu2d(Conv2d):
def forward(self, inp):
return relu(self.calc_conv(inp, self.weight, self.bias))
class DeformableConv2d(_ConvNd):
"""
Deformable Convolution.
:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
`(kernel_size, kernel_size)`.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Specifically, the shape of weight would be `(groups,
out_channel // groups, in_channels // groups, *kernel_size)`.
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `CROSS_CORRELATION`. Default:
`CROSS_CORRELATION`
:param compute_mode: When set to "DEFAULT", no special requirements will be
placed on the precision of intermediate results. When set to "FLOAT32",
"Float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, int]],
stride: Union[int, Tuple[int, int]] = 1,
padding: Union[int, Tuple[int, int]] = 0,
dilation: Union[int, Tuple[int, int]] = 1,
groups: int = 1,
bias: bool = True,
conv_mode: str = "CROSS_CORRELATION",
compute_mode: str = "DEFAULT",
):
kernel_size = _pair_nonzero(kernel_size)
stride = _pair_nonzero(stride)
padding = _pair(padding)
dilation = _pair_nonzero(dilation)
self.conv_mode = conv_mode
self.compute_mode = compute_mode
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
)
def _get_fanin(self):
kh, kw = self.kernel_size
ic = self.in_channels
return kh * kw * ic
def _infer_weight_shape(self):
group = self.groups
ichl = self.in_channels
ochl = self.out_channels
kh, kw = self.kernel_size
if group == 1:
# Assume format is NCHW
return (ochl, ichl, kh, kw)
assert (
ichl % group == 0 and ochl % group == 0
), "invalid config: input_channels={} output_channels={} group={}".format(
ichl, ochl, group
)
# Assume format is NCHW
return (group, ochl // group, ichl // group, kh, kw)
def _infer_bias_shape(self):
# Assume format is NCHW
return (1, self.out_channels, 1, 1)
def calc_conv(self, inp, weight, offset, mask, bias):
return deformable_conv2d(
inp,
weight,
offset,
mask,
bias,
self.stride,
self.padding,
self.dilation,
self.groups,
self.conv_mode,
self.compute_mode,
)
def forward(self, inp, offset, mask):
return self.calc_conv(inp, self.weight, offset, mask, self.bias)
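# A minimal usage sketch (hypothetical, not part of this module): assuming
# a single deformable group, the module expects ``offset`` with 2*kh*kw
# channels and ``mask`` with kh*kw channels, both at the spatial resolution
# of the convolution output; the Tensor import path is an assumption.
def _deformable_conv2d_module_example():
    from ..tensor import Tensor  # assumed to live beside Parameter

    m = DeformableConv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
    x = Tensor(np.random.randn(2, 4, 16, 16).astype("float32"))
    offset = Tensor(np.random.randn(2, 2 * 3 * 3, 16, 16).astype("float32"))
    mask = Tensor(np.random.uniform(size=(2, 3 * 3, 16, 16)).astype("float32"))
    # expected output shape: (2, 8, 16, 16)
    return m(x, offset, mask)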
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from ..functional import deformable_psroi_pooling
from .module import Module
class DeformablePSROIPooling(Module):
def __init__(
self,
no_trans,
part_size,
pooled_h,
pooled_w,
sample_per_part,
spatial_scale,
trans_std: float = 0.1,
):
super().__init__()
self.no_trans = no_trans
self.part_size = part_size
self.pooled_h = pooled_h
self.pooled_w = pooled_w
self.sample_per_part = sample_per_part
self.spatial_scale = spatial_scale
self.trans_std = trans_std
def forward(self, inp, rois, trans):
return deformable_psroi_pooling(
inp,
rois,
trans,
self.no_trans,
self.part_size,
self.pooled_h,
self.pooled_w,
self.sample_per_part,
self.spatial_scale,
self.trans_std,
)
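# A minimal usage sketch (hypothetical, not part of this module), mirroring
# the shapes used by the functional unit test in this commit; the Tensor
# import path is an assumption.
def _deformable_psroi_pooling_module_example():
    import numpy as np
    from ..tensor import Tensor

    pooling = DeformablePSROIPooling(
        no_trans=False, part_size=7, pooled_h=7, pooled_w=7,
        sample_per_part=4, spatial_scale=1.0 / 64, trans_std=0.1,
    )
    inp = Tensor(np.random.random((1, 256, 64, 64)).astype("float32"))
    rois = Tensor(np.random.random((1, 5)).astype("float32"))
    trans = Tensor(np.random.random((24, 2, 7, 7)).astype("float32"))
    return pooling(inp, rois, trans)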
......@@ -703,6 +703,33 @@ def test_argmxx_on_inf():
assert all(run_argmin() >= 0)
def test_deformable_psroi_pooling():
inp = np.random.random((1, 256, 64, 64)).astype("float32")
rois = np.random.random((1, 5)).astype("float32")
trans = np.random.random((24, 2, 7, 7)).astype("float32")
pooled_h = 7
pooled_w = 7
sample_per_part = 4
no_trans = False
part_size = 7
spatial_scale = 1.0 / 64
trans_std = 0.1
y = F.deformable_psroi_pooling(
tensor(inp),
tensor(rois),
tensor(trans),
no_trans,
part_size,
pooled_h,
pooled_w,
sample_per_part,
spatial_scale,
trans_std,
)
def test_cvt_color():
def rgb2gray(rgb):
return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
......
/**
* \file imperative/src/impl/ops/deformable_conv2d.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/opr/dnn/convolution.h"
#include "../op_trait.h"
namespace mgb::imperative {
namespace { namespace deformableconv {
std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
auto* node = &node_->cast_final_safe<opr::DeformableConv>();
return DeformableConv::make(node->param(), node->execution_policy());
}
auto apply_on_var_node(
const OpDef& def,
const VarNodeArray& inputs) {
auto&& dcn = static_cast<const DeformableConv&>(def);
mgb_assert(inputs.size() == 4);
return opr::DeformableConv::make(inputs[0], inputs[1], inputs[2], inputs[3], dcn.param(), dcn.policy());
}
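// Bridges the imperative OpDef and the graph operator: apply_on_var_node
// lowers a DeformableConv OpDef onto graph VarNodes, and make_from_op_node
// recovers the OpDef (param + execution policy) from an existing node.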
OP_TRAIT_REG(DeformableConv, DeformableConv, opr::DeformableConv)
.make_from_op_node(make_from_op_node)
.apply_on_var_node(apply_on_var_node)
.fallback();
}} // deformableconv
} // namespace mgb::imperative
/**
* \file imperative/src/impl/ops/deformable_psroi_pooling.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/opr/dnn/roi_pooling.h"
#include "../op_trait.h"
namespace mgb::imperative {
namespace { namespace deformable_psroi_pooling {
auto apply_on_var_node(
const OpDef& def,
const VarNodeArray& inputs) {
mgb_assert(inputs.size() == 3);
auto&& op = static_cast<const DeformablePSROIPooling&>(def);
return opr::DeformablePSROIPooling::make_all(inputs[0], inputs[1], inputs[2], op.param());
}
OP_TRAIT_REG(DeformablePSROIPooling, DeformablePSROIPooling)
.apply_on_var_node(apply_on_var_node)
.fallback();
}} // deformable_psroi_pooling
} // namespace mgb::imperative
/**
* \file imperative/src/impl/ops/autogen.cpp
* \file imperative/src/impl/ops/specialzations.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......
......@@ -48,6 +48,8 @@ def Convolution : MgbHashableOp<"Convolution", [ConvolutionParam, ExecutionPolic
def ConvolutionBackwardData: MgbHashableOp<"ConvolutionBackwardData", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;
def DeformableConv : MgbHashableOp<"DeformableConv", [ConvolutionParam, ExecutionPolicyParamBase<"policy">]>;
def GroupLocal: MgbHashableOp<"GroupLocal", [ConvolutionParam]>;
def Pooling: MgbHashableOp<"Pooling", [PoolingParam]>;
......@@ -56,6 +58,8 @@ def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]>;
def ROIPooling: MgbHashableOp<"ROIPooling", [ROIPoolingParam]>;
def DeformablePSROIPooling : MgbHashableOp<"DeformablePSROIPooling", [DeformablePSROIPoolingParam]>;
def ConvBias : MgbHashableOp<"ConvBias", [ConvBiasParam, ExecutionPolicyParamBase<"policy">]> {
let extraArguments = (ins
MgbDTypeAttr:$dtype
......