From b776434c72040571878e7b077d9b715098016edd Mon Sep 17 00:00:00 2001
From: Bai Yifan <me@ethanbai.com>
Date: Mon, 7 Dec 2020 19:18:13 +0800
Subject: [PATCH] Add deform_conv2d,DeformConv2D (#29364) (#29425)

* add deform_conv2d,DeformConv2D
---
 .../tests/unittests/test_deform_conv2d.py     | 558 ++++++++++++++++++
 python/paddle/vision/ops.py                   | 389 +++++++++++-
 2 files changed, 946 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_deform_conv2d.py

diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py
new file mode 100644
index 00000000000..660625c9bf7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py
@@ -0,0 +1,558 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+import paddle.nn.initializer as I
+import numpy as np
+import unittest
+from unittest import TestCase
+
+
+class TestDeformConv2D(TestCase):
+    """Compare the DeformConv2D layer with fluid.layers.deformable_conv."""
+    batch_size = 4
+    spatial_shape = (16, 16)
+    dtype = "float32"
+
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [0, 0]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.groups = 1
+        self.no_bias = True
+
+    def prepare(self):
+        """Create the random weight, optional bias, input, offset and mask."""
+        if isinstance(self.kernel_size, int):
+            filter_shape = (self.kernel_size, ) * 2
+        else:
+            filter_shape = tuple(self.kernel_size)
+        self.filter_shape = filter_shape
+
+        self.weight = np.random.uniform(
+            -1, 1, (self.out_channels, self.in_channels // self.groups
+                    ) + filter_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(-1, 1, (
+                self.out_channels, )).astype(self.dtype)
+
+        def out_size(in_size, pad_size, dilation_size, kernel_size,
+                     stride_size):
+            return (in_size + 2 * pad_size -
+                    (dilation_size * (kernel_size - 1) + 1)) / stride_size + 1
+
+        # Index the normalized filter_shape rather than self.kernel_size so
+        # that an int kernel_size (handled above) does not raise a TypeError.
+        out_shape = tuple(
+            int(
+                out_size(self.spatial_shape[i], self.padding[i],
+                         self.dilation[i], filter_shape[i], self.stride[i]))
+            for i in range(2))
+
+        self.input_shape = (self.batch_size, self.in_channels
+                            ) + self.spatial_shape
+        self.offset_shape = (self.batch_size, 2 * filter_shape[0] *
+                             filter_shape[1]) + out_shape
+        self.mask_shape = (self.batch_size, filter_shape[0] * filter_shape[1]
+                           ) + out_shape
+
+        self.input = np.random.uniform(-1, 1,
+                                       self.input_shape).astype(self.dtype)
+        self.offset = np.random.uniform(-1, 1,
+                                        self.offset_shape).astype(self.dtype)
+        self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype)
+
+    def static_graph_case_dcn(self):
+        """Return (v1, v2) outputs of fluid.layers.deformable_conv."""
+        main = paddle.static.Program()
+        start = paddle.static.Program()
+        paddle.enable_static()
+        with paddle.static.program_guard(main, start):
+            x = paddle.static.data(
+                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            offset = paddle.static.data(
+                "offset",
+                (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+            mask = paddle.static.data(
+                "mask",
+                (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+
+            y_v1 = paddle.fluid.layers.deformable_conv(
+                input=x,
+                offset=offset,
+                mask=None,
+                num_filters=self.out_channels,
+                filter_size=self.filter_shape,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+                deformable_groups=1,
+                im2col_step=1,
+                param_attr=I.Assign(self.weight),
+                bias_attr=False if self.no_bias else I.Assign(self.bias),
+                modulated=False)
+
+            y_v2 = paddle.fluid.layers.deformable_conv(
+                input=x,
+                offset=offset,
+                mask=mask,
+                num_filters=self.out_channels,
+                filter_size=self.filter_shape,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+                deformable_groups=1,
+                im2col_step=1,
+                param_attr=I.Assign(self.weight),
+                bias_attr=False if self.no_bias else I.Assign(self.bias))
+
+        exe = paddle.static.Executor(self.place)
+        exe.run(start)
+        out_v1, out_v2 = exe.run(main,
+                                 feed={
+                                     "input": self.input,
+                                     "offset": self.offset,
+                                     "mask": self.mask
+                                 },
+                                 fetch_list=[y_v1, y_v2])
+        return out_v1, out_v2
+
+    def dygraph_case_dcn(self):
+        """Return (v1, v2) outputs of the dygraph DeformConv2D layer."""
+        paddle.disable_static()
+        x = paddle.to_tensor(self.input)
+        offset = paddle.to_tensor(self.offset)
+        mask = paddle.to_tensor(self.mask)
+
+        deform_conv2d = paddle.vision.ops.DeformConv2D(
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            kernel_size=self.kernel_size,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups,
+            weight_attr=I.Assign(self.weight),
+            bias_attr=False if self.no_bias else I.Assign(self.bias))
+
+        y_v1 = deform_conv2d(x, offset)
+        y_v2 = deform_conv2d(x, offset, mask)
+
+        out_v1 = y_v1.numpy()
+        out_v2 = y_v2.numpy()
+
+        return out_v1, out_v2
+
+    def _test_identity(self):
+        """Assert that the static-graph and dygraph outputs agree."""
+        self.prepare()
+        static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn()
+        dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn()
+        np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1)
+        np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2)
+
+    def test_identity(self):
+        """Run the comparison on CPU and, if built with CUDA, on GPU 0."""
+        self.place = paddle.CPUPlace()
+        self._test_identity()
+
+        if paddle.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+            self._test_identity()
+
+
+class TestDeformConv2DFunctional(TestCase):
+    """Compare deform_conv2d with fluid.layers.deformable_conv."""
+    batch_size = 4
+    spatial_shape = (16, 16)
+    dtype = "float32"
+
+    def setUp(self):
+        self.in_channels = 3
+        self.out_channels = 5
+        self.kernel_size = [3, 3]
+        self.padding = [0, 0]
+        self.stride = [1, 1]
+        self.dilation = [1, 1]
+        self.groups = 1
+        self.no_bias = True
+
+    def prepare(self):
+        """Create the random weight, optional bias, input, offset and mask."""
+        if isinstance(self.kernel_size, int):
+            filter_shape = (self.kernel_size, ) * 2
+        else:
+            filter_shape = tuple(self.kernel_size)
+        self.filter_shape = filter_shape
+
+        self.weight = np.random.uniform(
+            -1, 1, (self.out_channels, self.in_channels // self.groups
+                    ) + filter_shape).astype(self.dtype)
+        if not self.no_bias:
+            self.bias = np.random.uniform(-1, 1, (
+                self.out_channels, )).astype(self.dtype)
+
+        def out_size(in_size, pad_size, dilation_size, kernel_size,
+                     stride_size):
+            return (in_size + 2 * pad_size -
+                    (dilation_size * (kernel_size - 1) + 1)) / stride_size + 1
+
+        # Index the normalized filter_shape rather than self.kernel_size so
+        # that an int kernel_size (handled above) does not raise a TypeError.
+        out_shape = tuple(
+            int(
+                out_size(self.spatial_shape[i], self.padding[i],
+                         self.dilation[i], filter_shape[i], self.stride[i]))
+            for i in range(2))
+
+        self.input_shape = (self.batch_size, self.in_channels
+                            ) + self.spatial_shape
+        self.offset_shape = (self.batch_size, 2 * filter_shape[0] *
+                             filter_shape[1]) + out_shape
+        self.mask_shape = (self.batch_size, filter_shape[0] * filter_shape[1]
+                           ) + out_shape
+
+        self.input = np.random.uniform(-1, 1,
+                                       self.input_shape).astype(self.dtype)
+        self.offset = np.random.uniform(-1, 1,
+                                        self.offset_shape).astype(self.dtype)
+        self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype)
+
+    def static_graph_case_dcn(self):
+        """Return (v1, v2) outputs of fluid.layers.deformable_conv."""
+        main = paddle.static.Program()
+        start = paddle.static.Program()
+        paddle.enable_static()
+        with paddle.static.program_guard(main, start):
+            x = paddle.static.data(
+                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            offset = paddle.static.data(
+                "offset",
+                (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+            mask = paddle.static.data(
+                "mask",
+                (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+
+            y_v1 = paddle.fluid.layers.deformable_conv(
+                input=x,
+                offset=offset,
+                mask=None,
+                num_filters=self.out_channels,
+                filter_size=self.filter_shape,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+                deformable_groups=1,
+                im2col_step=1,
+                param_attr=I.Assign(self.weight),
+                bias_attr=False if self.no_bias else I.Assign(self.bias),
+                modulated=False)
+
+            y_v2 = paddle.fluid.layers.deformable_conv(
+                input=x,
+                offset=offset,
+                mask=mask,
+                num_filters=self.out_channels,
+                filter_size=self.filter_shape,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups,
+                deformable_groups=1,
+                im2col_step=1,
+                param_attr=I.Assign(self.weight),
+                bias_attr=False if self.no_bias else I.Assign(self.bias))
+
+        exe = paddle.static.Executor(self.place)
+        exe.run(start)
+        out_v1, out_v2 = exe.run(main,
+                                 feed={
+                                     "input": self.input,
+                                     "offset": self.offset,
+                                     "mask": self.mask
+                                 },
+                                 fetch_list=[y_v1, y_v2])
+        return out_v1, out_v2
+
+    def dygraph_case_dcn(self):
+        """Return (v1, v2) outputs of deform_conv2d in dygraph mode."""
+        paddle.disable_static()
+        x = paddle.to_tensor(self.input)
+        offset = paddle.to_tensor(self.offset)
+        mask = paddle.to_tensor(self.mask)
+        weight = paddle.to_tensor(self.weight)
+        bias = None if self.no_bias else paddle.to_tensor(self.bias)
+
+        y_v1 = paddle.vision.ops.deform_conv2d(
+            x=x,
+            offset=offset,
+            weight=weight,
+            bias=bias,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups, )
+
+        y_v2 = paddle.vision.ops.deform_conv2d(
+            x=x,
+            offset=offset,
+            mask=mask,
+            weight=weight,
+            bias=bias,
+            stride=self.stride,
+            padding=self.padding,
+            dilation=self.dilation,
+            groups=self.groups, )
+
+        out_v1 = y_v1.numpy()
+        out_v2 = y_v2.numpy()
+
+        return out_v1, out_v2
+
+    def new_api_static_graph_case_dcn(self):
+        """Return (v1, v2) outputs of deform_conv2d in static-graph mode."""
+        main = paddle.static.Program()
+        start = paddle.static.Program()
+        paddle.enable_static()
+        with paddle.static.program_guard(main, start):
+            x = paddle.static.data(
+                "input", (-1, self.in_channels, -1, -1), dtype=self.dtype)
+            offset = paddle.static.data(
+                "offset",
+                (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+            mask = paddle.static.data(
+                "mask",
+                (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1),
+                dtype=self.dtype)
+            weight = paddle.static.data(
+                "weight", list(self.weight.shape), dtype=self.dtype)
+            if not self.no_bias:
+                bias = paddle.static.data("bias", [-1], dtype=self.dtype)
+
+            y_v1 = paddle.vision.ops.deform_conv2d(
+                x=x,
+                offset=offset,
+                weight=weight,
+                bias=None if self.no_bias else bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups, )
+
+            y_v2 = paddle.vision.ops.deform_conv2d(
+                x=x,
+                offset=offset,
+                mask=mask,
+                weight=weight,
+                bias=None if self.no_bias else bias,
+                stride=self.stride,
+                padding=self.padding,
+                dilation=self.dilation,
+                groups=self.groups, )
+
+        exe = paddle.static.Executor(self.place)
+        exe.run(start)
+        feed_dict = {
+            "input": self.input,
+            "offset": self.offset,
+            "mask": self.mask,
+            "weight": self.weight
+        }
+        if not self.no_bias:
+            feed_dict["bias"] = self.bias
+
+        out_v1, out_v2 = exe.run(main, feed=feed_dict, fetch_list=[y_v1, y_v2])
+        return out_v1, out_v2
+
+    def _test_identity(self):
+        """Assert that all three execution paths agree."""
+        self.prepare()
+        static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn()
+        dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn()
+        new_dcn_v1, new_dcn_v2 = self.new_api_static_graph_case_dcn()
+        np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1)
+        np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2)
+        np.testing.assert_array_almost_equal(static_dcn_v1, new_dcn_v1)
+        np.testing.assert_array_almost_equal(static_dcn_v2, new_dcn_v2)
+
+    def test_identity(self):
+        """Run the comparison on CPU and, if built with CUDA, on GPU 0."""
+        self.place = paddle.CPUPlace()
+        self._test_identity()
+
+        if paddle.is_compiled_with_cuda():
+            self.place = paddle.CUDAPlace(0)
+            self._test_identity()
+
+
+# testcases for DeformConv2D
+class TestDeformConv2DWithPadding(TestDeformConv2D):
+    """DeformConv2D layer with padding 2 on both axes and no bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = True
+
+
+class TestDeformConv2DWithBias(TestDeformConv2D):
+    """DeformConv2D layer with padding 2 on both axes and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithAsynPadding(TestDeformConv2D):
+    """DeformConv2D layer with unequal H/W padding [1, 2] and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithDilation(TestDeformConv2D):
+    """DeformConv2D layer with dilation 3, padding 1 and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [1, 1], [3, 3]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithStride(TestDeformConv2D):
+    """DeformConv2D layer with stride 2, padding 1 and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [2, 2], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DWithGroups(TestDeformConv2D):
+    """DeformConv2D layer with groups=5 (5 in/out channels) and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 5, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 5
+        self.no_bias = False
+
+
+# testcases for deform_conv2d
+class TestDeformConv2DFunctionalWithPadding(TestDeformConv2DFunctional):
+    """deform_conv2d with padding 2 on both axes and no bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = True
+
+
+class TestDeformConv2DFunctionalWithBias(TestDeformConv2DFunctional):
+    """deform_conv2d with padding 2 on both axes and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [2, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DFunctionalWithAsynPadding(TestDeformConv2DFunctional):
+    """deform_conv2d with unequal H/W padding [1, 2] and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 2]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DFunctionalWithDilation(TestDeformConv2DFunctional):
+    """deform_conv2d with dilation 3, padding 1 and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [1, 1], [3, 3]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DFunctionalWithStride(TestDeformConv2DFunctional):
+    """deform_conv2d with stride 2, padding 1 and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 3, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [2, 2], [1, 1]
+        self.groups = 1
+        self.no_bias = False
+
+
+class TestDeformConv2DFunctionalWithGroups(TestDeformConv2DFunctional):
+    """deform_conv2d with groups=5 (5 in/out channels) and a bias."""
+
+    def setUp(self):
+        self.in_channels, self.out_channels = 5, 5
+        self.kernel_size = [3, 3]
+        self.padding = [1, 1]
+        self.stride, self.dilation = [1, 1], [1, 1]
+        self.groups = 5
+        self.no_bias = False
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 1fd0b1d717c..4b4e2088708 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -16,10 +16,13 @@ import numpy as np
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
 from ..fluid import core, layers
+from ..fluid.layers import nn, utils
+from ..nn import Layer
+from ..fluid.initializer import Normal
 
 from paddle.common_ops_import import *
 
-__all__ = ['yolo_loss', 'yolo_box']
+__all__ = ['yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D']
 
 
 def yolo_loss(x,
@@ -386,3 +389,387 @@ def yolo_box(x,
         },
         attrs=attrs)
     return boxes, scores
+
+
+def deform_conv2d(x,
+                  offset,
+                  weight,
+                  bias=None,
+                  stride=1,
+                  padding=0,
+                  dilation=1,
+                  groups=1,
+                  mask=None,
+                  name=None):
+    r"""
+    Compute 2-D deformable convolution on 4-D input.
+    Given input image x, output feature map y, the deformable convolution operation can be expressed as follows:
+
+
+    Deformable Convolution v2:
+
+    .. math::
+
+        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}
+
+    Deformable Convolution v1:
+
+    .. math::
+
+        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}
+
+    Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
+    and :math:`\Delta m_k` is 1 in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
+    <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
+
+    Example:
+        - Input:
+
+          x shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          weight shape: :math:`(C_{out}, C_{in} / groups, H_f, W_f)`
+
+          offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`
+
+          mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
+            W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        x (Tensor): The input image with [N, C, H, W] format. A Tensor with type
+            float32, float64.
+        offset (Tensor): The input coordinate offset of deformable convolution layer.
+            A Tensor with type float32, float64.
+        weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is
+            the number of output channels, g is the number of groups, kH is the filter's
+            height, kW is the filter's width.
+        bias (Tensor, optional): The bias with shape [M,].
+        stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int, optional): The groups number of the deformable conv layer. According to
+            grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        mask (Tensor, optional): The input mask of deformable convolution layer.
+            A Tensor with type float32, float64. It should be None when you use
+            deformable convolution v1.
+        name(str, optional): For details, please refer to :ref:`api_guide_Name`.
+                        Generally, no setting is required. Default: None.
+    Returns:
+        Tensor: The tensor variable storing the deformable convolution
+                  result. A Tensor with type float32, float64.
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+    Examples:
+        .. code-block:: python
+
+          #deformable conv v2:
+
+          import paddle
+          input = paddle.rand((8, 1, 28, 28))
+          kh, kw = 3, 3
+          weight = paddle.rand((16, 1, kh, kw))
+          # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
+          # mask shape should be [bs, kh * kw, out_h, out_w]
+          # In this case, for an input of 28, stride of 1
+          # and kernel size of 3, without padding, the output size is 26
+          offset = paddle.rand((8, 2 * kh * kw, 26, 26))
+          mask = paddle.rand((8, kh * kw, 26, 26))
+          out = paddle.vision.ops.deform_conv2d(input, offset, weight, mask=mask)
+          print(out.shape)
+          # returns
+          # [8, 16, 26, 26]
+
+          #deformable conv v1:
+
+          import paddle
+          input = paddle.rand((8, 1, 28, 28))
+          kh, kw = 3, 3
+          weight = paddle.rand((16, 1, kh, kw))
+          # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
+          # In this case, for an input of 28, stride of 1
+          # and kernel size of 3, without padding, the output size is 26
+          offset = paddle.rand((8, 2 * kh * kw, 26, 26))
+          out = paddle.vision.ops.deform_conv2d(input, offset, weight)
+          print(out.shape)
+          # returns
+          # [8, 16, 26, 26]
+    """
+    stride = utils.convert_to_list(stride, 2, 'stride')
+    padding = utils.convert_to_list(padding, 2, 'padding')
+    dilation = utils.convert_to_list(dilation, 2, 'dilation')
+    # No mask selects deformable conv v1; a mask selects v2.
+    use_deform_conv2d_v1 = mask is None
+
+    if in_dygraph_mode():
+        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
+                 'groups', groups, 'deformable_groups', 1, 'im2col_step', 1)
+        if use_deform_conv2d_v1:
+            op_type = 'deformable_conv_v1'
+            pre_bias = getattr(core.ops, op_type)(x, offset, weight, *attrs)
+        else:
+            op_type = 'deformable_conv'
+            pre_bias = getattr(core.ops, op_type)(x, offset, mask, weight,
+                                                  *attrs)
+        if bias is not None:
+            out = nn.elementwise_add(pre_bias, bias, axis=1)
+        else:
+            out = pre_bias
+    else:
+        check_variable_and_dtype(x, "x", ['float32', 'float64'],
+                                 'deform_conv2d')
+        check_variable_and_dtype(offset, "offset", ['float32', 'float64'],
+                                 'deform_conv2d')
+
+        num_channels = x.shape[1]
+
+        helper = LayerHelper('deformable_conv', **locals())
+        # The input tensor is named `x` here, whereas input_dtype() looks up
+        # the `input` keyword by default, so name the parameter explicitly.
+        dtype = helper.input_dtype(input_param_name='x')
+
+        # stride, padding and dilation were already normalized by the
+        # convert_to_list calls at the top; no second pass is needed here.
+        pre_bias = helper.create_variable_for_type_inference(dtype)
+
+        if use_deform_conv2d_v1:
+            op_type = 'deformable_conv_v1'
+            inputs = {
+                'Input': x,
+                'Filter': weight,
+                'Offset': offset,
+            }
+        else:
+            op_type = 'deformable_conv'
+            inputs = {
+                'Input': x,
+                'Filter': weight,
+                'Offset': offset,
+                'Mask': mask,
+            }
+
+        outputs = {"Output": pre_bias}
+        attrs = {
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'deformable_groups': 1,
+            'im2col_step': 1,
+        }
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+        if bias is not None:
+            out = helper.create_variable_for_type_inference(dtype)
+            helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias],
+                        'Y': [bias]},
+                outputs={'Out': [out]},
+                attrs={'axis': 1})
+        else:
+            out = pre_bias
+    return out
+
+
+class DeformConv2D(Layer):
+    r"""
+    Compute 2-D deformable convolution on 4-D input.
+    Given input image x, output feature map y, the deformable convolution operation can be expressed as follows:
+
+
+    Deformable Convolution v2:
+
+    .. math::
+
+        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k}
+
+    Deformable Convolution v1:
+
+    .. math::
+
+        y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)}
+
+    Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location,
+    and :math:`\Delta m_k` is 1 in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results
+    <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
+
+    Example:
+        - Input:
+
+          x shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          weight shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+          offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`
+
+          mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
+            W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+
+    Parameters:
+        in_channels(int): The number of input channels in the input image.
+        out_channels(int): The number of output channels produced by the convolution.
+        kernel_size(int|list|tuple): The size of the convolving kernel.
+        stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. The default value is 1.
+        padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. The default value is 1.
+        groups(int, optional): The groups number of the DeformConv2D layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. The default value is 1.
+        weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If it is set to None, the parameter
+            is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is
+            :math:`(\\frac{2.0 }{filter\\_elem\\_num})^{0.5}`. The default value is None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized to zero. The default value is None.
+    Attribute:
+        **weight** (Parameter): the learnable weights of filter of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+    Shape:
+        - x: :math:`(N, C_{in}, H_{in}, W_{in})`
+        - offset: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})`
+        - mask: :math:`(N, H_f * W_f, H_{out}, W_{out})`
+        - output: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        ..  math::
+           H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\\_size[0] - 1) + 1))}{strides[0]} + 1 \\\\
+           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\\_size[1] - 1) + 1))}{strides[1]} + 1
+    Examples:
+        .. code-block:: python
+
+          #deformable conv v2:
+
+          import paddle
+          input = paddle.rand((8, 1, 28, 28))
+          kh, kw = 3, 3
+          # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
+          # mask shape should be [bs, kh * kw, out_h, out_w]
+          # In this case, for an input of 28, stride of 1
+          # and kernel size of 3, without padding, the output size is 26
+          offset = paddle.rand((8, 2 * kh * kw, 26, 26))
+          mask = paddle.rand((8, kh * kw, 26, 26))
+          deform_conv = paddle.vision.ops.DeformConv2D(
+              in_channels=1,
+              out_channels=16,
+              kernel_size=[kh, kw])
+          out = deform_conv(input, offset, mask)
+          print(out.shape)
+          # returns
+          # [8, 16, 26, 26]
+
+          #deformable conv v1:
+
+          import paddle
+          input = paddle.rand((8, 1, 28, 28))
+          kh, kw = 3, 3
+          # offset shape should be [bs, 2 * kh * kw, out_h, out_w]
+          # no mask is passed for deformable conv v1 (mask defaults to None)
+          # In this case, for an input of 28, stride of 1
+          # and kernel size of 3, without padding, the output size is 26
+          offset = paddle.rand((8, 2 * kh * kw, 26, 26))
+          deform_conv = paddle.vision.ops.DeformConv2D(
+              in_channels=1,
+              out_channels=16,
+              kernel_size=[kh, kw])
+          out = deform_conv(input, offset)
+          print(out.shape)
+          # returns
+          # [8, 16, 26, 26]
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, weight_attr=None,
+                 bias_attr=None):
+        """Store the layer configuration and create ``weight`` and ``bias``.
+
+        ``kernel_size``, ``stride``, ``padding`` and ``dilation`` are each
+        normalized with ``utils.convert_to_list(value, 2, name)``.
+
+        Raises:
+            ValueError: if ``in_channels`` is not divisible by ``groups``.
+        """
+        super(DeformConv2D, self).__init__()
+        assert weight_attr is not False, "weight_attr should not be False in Conv."
+
+        # Plain configuration, stored exactly as passed in.
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._groups = groups
+        self._weight_attr = weight_attr
+        self._bias_attr = bias_attr
+        self._channel_dim = 1
+
+        # Spatial hyper-parameters; the conversion/validation order is kept
+        # so the first error raised for bad arguments stays the same.
+        self._stride = utils.convert_to_list(stride, 2, 'stride')
+        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
+        self._kernel_size = utils.convert_to_list(kernel_size, 2, 'kernel_size')
+        if in_channels % groups != 0:
+            raise ValueError("in_channels must be divisible by groups.")
+        self._padding = utils.convert_to_list(padding, 2, 'padding')
+
+        # Default weight init: std = sqrt(2 / (prod(kernel_size) * in_channels)).
+        fan = np.prod(self._kernel_size) * self._in_channels
+        weight_init = Normal(0.0, (2.0 / fan)**0.5, 0)
+        self.weight = self.create_parameter(
+            shape=[out_channels, in_channels // groups] + self._kernel_size,
+            attr=self._weight_attr,
+            default_initializer=weight_init)
+        # Per the class docstring, bias may end up None (bias_attr=False).
+        self.bias = self.create_parameter(
+            shape=[self._out_channels], attr=self._bias_attr, is_bias=True)
+
+    def forward(self, x, offset, mask=None):
+        """Run deformable convolution on ``x`` with this layer's parameters.
+
+        ``offset`` and the optional ``mask`` are handed to ``deform_conv2d``
+        unchanged, together with the stored weight, bias and hyper-parameters.
+        """
+        return deform_conv2d(
+            x=x, offset=offset,
+            weight=self.weight, bias=self.bias,
+            stride=self._stride, padding=self._padding,
+            dilation=self._dilation, groups=self._groups,
+            mask=mask)
-- 
GitLab