diff --git a/python/paddle/fluid/tests/unittests/test_deform_conv2d.py b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..660625c9bf75613f59799b778d95dda246e34a79 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_deform_conv2d.py @@ -0,0 +1,558 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn.functional as F +import paddle.nn.initializer as I +import numpy as np +import unittest +from unittest import TestCase + + +class TestDeformConv2D(TestCase): + batch_size = 4 + spatial_shape = (16, 16) + dtype = "float32" + + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [0, 0] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = True + + def prepare(self): + if isinstance(self.kernel_size, int): + filter_shape = (self.kernel_size, ) * 2 + else: + filter_shape = tuple(self.kernel_size) + self.filter_shape = filter_shape + + self.weight = np.random.uniform( + -1, 1, (self.out_channels, self.in_channels // self.groups + ) + filter_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, ( + self.out_channels, )).astype(self.dtype) + + def out_size(in_size, pad_size, dilation_size, kernel_size, + stride_size): + return (in_size + 2 * pad_size - + (dilation_size * 
(kernel_size - 1) + 1)) / stride_size + 1 + + out_h = int( + out_size(self.spatial_shape[0], self.padding[0], self.dilation[0], + self.kernel_size[0], self.stride[0])) + out_w = int( + out_size(self.spatial_shape[1], self.padding[1], self.dilation[1], + self.kernel_size[1], self.stride[1])) + out_shape = (out_h, out_w) + + self.input_shape = (self.batch_size, self.in_channels + ) + self.spatial_shape + + self.offset_shape = (self.batch_size, 2 * filter_shape[0] * + filter_shape[1]) + out_shape + + self.mask_shape = (self.batch_size, filter_shape[0] * filter_shape[1] + ) + out_shape + + self.input = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + + self.offset = np.random.uniform(-1, 1, + self.offset_shape).astype(self.dtype) + + self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype) + + def static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype) + offset = paddle.static.data( + "offset", + (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + mask = paddle.static.data( + "mask", + (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + + y_v1 = paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=None, + num_filters=self.out_channels, + filter_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=1, + im2col_step=1, + param_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + modulated=False) + + y_v2 = paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=mask, + num_filters=self.out_channels, + filter_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + 
deformable_groups=1, + im2col_step=1, + param_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias)) + + exe = paddle.static.Executor(self.place) + exe.run(start) + out_v1, out_v2 = exe.run(main, + feed={ + "input": self.input, + "offset": self.offset, + "mask": self.mask + }, + fetch_list=[y_v1, y_v2]) + return out_v1, out_v2 + + def dygraph_case_dcn(self): + paddle.disable_static() + x = paddle.to_tensor(self.input) + offset = paddle.to_tensor(self.offset) + mask = paddle.to_tensor(self.mask) + + bias = None if self.no_bias else paddle.to_tensor(self.bias) + + deform_conv2d = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias)) + + y_v1 = deform_conv2d(x, offset) + y_v2 = deform_conv2d(x, offset, mask) + + out_v1 = y_v1.numpy() + out_v2 = y_v2.numpy() + + return out_v1, out_v2 + + def _test_identity(self): + self.prepare() + static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn() + dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn() + np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2) + + def test_identity(self): + self.place = paddle.CPUPlace() + self._test_identity() + + if paddle.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self._test_identity() + + +class TestDeformConv2DFunctional(TestCase): + batch_size = 4 + spatial_shape = (16, 16) + dtype = "float32" + + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [0, 0] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = True + + def prepare(self): + if isinstance(self.kernel_size, int): + filter_shape = (self.kernel_size, ) * 2 + else: + 
filter_shape = tuple(self.kernel_size) + self.filter_shape = filter_shape + + self.weight = np.random.uniform( + -1, 1, (self.out_channels, self.in_channels // self.groups + ) + filter_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, ( + self.out_channels, )).astype(self.dtype) + + def out_size(in_size, pad_size, dilation_size, kernel_size, + stride_size): + return (in_size + 2 * pad_size - + (dilation_size * (kernel_size - 1) + 1)) / stride_size + 1 + + out_h = int( + out_size(self.spatial_shape[0], self.padding[0], self.dilation[0], + self.kernel_size[0], self.stride[0])) + out_w = int( + out_size(self.spatial_shape[1], self.padding[1], self.dilation[1], + self.kernel_size[1], self.stride[1])) + out_shape = (out_h, out_w) + + self.input_shape = (self.batch_size, self.in_channels + ) + self.spatial_shape + + self.offset_shape = (self.batch_size, 2 * filter_shape[0] * + filter_shape[1]) + out_shape + + self.mask_shape = (self.batch_size, filter_shape[0] * filter_shape[1] + ) + out_shape + + self.input = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + + self.offset = np.random.uniform(-1, 1, + self.offset_shape).astype(self.dtype) + + self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype) + + def static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype) + offset = paddle.static.data( + "offset", + (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + mask = paddle.static.data( + "mask", + (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + + y_v1 = paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=None, + num_filters=self.out_channels, + filter_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + 
dilation=self.dilation, + groups=self.groups, + deformable_groups=1, + im2col_step=1, + param_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + modulated=False) + + y_v2 = paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=mask, + num_filters=self.out_channels, + filter_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=1, + im2col_step=1, + param_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias)) + + exe = paddle.static.Executor(self.place) + exe.run(start) + out_v1, out_v2 = exe.run(main, + feed={ + "input": self.input, + "offset": self.offset, + "mask": self.mask + }, + fetch_list=[y_v1, y_v2]) + return out_v1, out_v2 + + def dygraph_case_dcn(self): + paddle.disable_static() + x = paddle.to_tensor(self.input) + offset = paddle.to_tensor(self.offset) + mask = paddle.to_tensor(self.mask) + weight = paddle.to_tensor(self.weight) + bias = None if self.no_bias else paddle.to_tensor(self.bias) + + y_v1 = paddle.vision.ops.deform_conv2d( + x=x, + offset=offset, + weight=weight, + bias=bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, ) + + y_v2 = paddle.vision.ops.deform_conv2d( + x=x, + offset=offset, + mask=mask, + weight=weight, + bias=bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, ) + + out_v1 = y_v1.numpy() + out_v2 = y_v2.numpy() + + return out_v1, out_v2 + + def new_api_static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype) + offset = paddle.static.data( + "offset", + (-1, 2 * self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + mask = paddle.static.data( + 
"mask", + (-1, self.filter_shape[0] * self.filter_shape[1], -1, -1), + dtype=self.dtype) + + weight = paddle.static.data( + "weight", list(self.weight.shape), dtype=self.dtype) + + if not self.no_bias: + bias = paddle.static.data("bias", [-1], dtype=self.dtype) + + y_v1 = paddle.vision.ops.deform_conv2d( + x=x, + offset=offset, + weight=weight, + bias=None if self.no_bias else bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, ) + + y_v2 = paddle.vision.ops.deform_conv2d( + x=x, + offset=offset, + mask=mask, + weight=weight, + bias=None if self.no_bias else bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, ) + + exe = paddle.static.Executor(self.place) + exe.run(start) + feed_dict = { + "input": self.input, + "offset": self.offset, + "mask": self.mask, + "weight": self.weight + } + if not self.no_bias: + feed_dict["bias"] = self.bias + + out_v1, out_v2 = exe.run(main, feed=feed_dict, fetch_list=[y_v1, y_v2]) + return out_v1, out_v2 + + def _test_identity(self): + self.prepare() + static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn() + dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn() + new_static_dcn_v1, new_static_dcn_v2 = self.new_api_static_graph_case_dcn( + ) + np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2) + np.testing.assert_array_almost_equal(static_dcn_v1, new_static_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, new_static_dcn_v2) + + def test_identity(self): + self.place = paddle.CPUPlace() + self._test_identity() + + if paddle.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self._test_identity() + + +# testcases for DeformConv2D +class TestDeformConv2DWithPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + 
self.groups = 1 + self.no_bias = True + + +class TestDeformConv2DWithBias(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithAsynPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDilation(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [3, 3] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithStride(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [2, 2] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithGroups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 5 + self.no_bias = False + + +# testcases for deform_conv2d +class TestDeformConv2DFunctionalWithPadding(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = True + + +class TestDeformConv2DFunctionalWithBias(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class 
TestDeformConv2DFunctionalWithAsynPadding(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DFunctionalWithDilation(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [3, 3] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DFunctionalWithStride(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [2, 2] + self.dilation = [1, 1] + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DFunctionalWithGroups(TestDeformConv2DFunctional): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 5 + self.no_bias = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 1fd0b1d717cefba3e7472405101198e719626207..4b4e2088708bb955d5c7811473b49baf996a4ef3 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -16,10 +16,13 @@ import numpy as np from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import core, layers +from ..fluid.layers import nn, utils +from ..nn import Layer +from ..fluid.initializer import Normal from paddle.common_ops_import import * -__all__ = ['yolo_loss', 'yolo_box'] +__all__ = ['yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D'] def yolo_loss(x, @@ -386,3 +389,387 @@ def yolo_box(x, }, attrs=attrs) return boxes, scores + + +def deform_conv2d(x, + offset, + 
weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + mask=None, + name=None): + r""" + Compute 2-D deformable convolution on 4-D input. + Given input image x, output feature map y, the deformable convolution operation can be expressed as follows: + + + Deformable Convolution v2: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k} + + Deformable Convolution v1: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)} + + Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, + and :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results + <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_. + + Example: + - Input: + + x shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + weight shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + + offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})` + + mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\ + W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + x (Tensor): The input image with [N, C, H, W] format. A Tensor with type + float32, float64. + offset (Tensor): The input coordinate offset of deformable convolution layer. + A Tensor with type float32, float64. + weight (Tensor): The convolution kernel with shape [M, C/g, kH, kW], where M is + the number of output channels, g is the number of groups, kH is the filter's + height, kW is the filter's width. + bias (Tensor, optional): The bias with shape [M,]. + stride (int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. 
Default: stride = 1. + padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int, optional): The groups number of the deformable conv layer. According to + grouped convolution in Alex Krizhevsky's Deep CNN paper: when groups=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1. + mask (Tensor, optional): The input mask of deformable convolution layer. + A Tensor with type float32, float64. It should be None when you use + deformable convolution v1. + name(str, optional): For details, please refer to :ref:`api_guide_Name`. + Generally, no setting is required. Default: None. + Returns: + Tensor: The tensor variable storing the deformable convolution \ + result. A Tensor with type float32, float64. + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + Examples: + .. 
code-block:: python + + #deformable conv v2: + + import paddle + input = paddle.rand((8, 1, 28, 28)) + kh, kw = 3, 3 + weight = paddle.rand((16, 1, kh, kw)) + # offset shape should be [bs, 2 * kh * kw, out_h, out_w] + # mask shape should be [bs, kh * kw, out_h, out_w] + # In this case, for an input of 28, stride of 1 + # and kernel size of 3, without padding, the output size is 26 + offset = paddle.rand((8, 2 * kh * kw, 26, 26)) + mask = paddle.rand((8, kh * kw, 26, 26)) + out = paddle.vision.ops.deform_conv2d(input, offset, weight, mask=mask) + print(out.shape) + # returns + [8, 16, 26, 26] + + #deformable conv v1: + + import paddle + input = paddle.rand((8, 1, 28, 28)) + kh, kw = 3, 3 + weight = paddle.rand((16, 1, kh, kw)) + # offset shape should be [bs, 2 * kh * kw, out_h, out_w] + # In this case, for an input of 28, stride of 1 + # and kernel size of 3, without padding, the output size is 26 + offset = paddle.rand((8, 2 * kh * kw, 26, 26)) + out = paddle.vision.ops.deform_conv2d(input, offset, weight) + print(out.shape) + # returns + [8, 16, 26, 26] + """ + stride = utils.convert_to_list(stride, 2, 'stride') + padding = utils.convert_to_list(padding, 2, 'padding') + dilation = utils.convert_to_list(dilation, 2, 'dilation') + + use_deform_conv2d_v1 = True if mask is None else False + + if in_dygraph_mode(): + attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, + 'groups', groups, 'im2col_step', 1) + if use_deform_conv2d_v1: + op_type = 'deformable_conv_v1' + pre_bias = getattr(core.ops, op_type)(x, offset, weight, *attrs) + else: + op_type = 'deformable_conv' + pre_bias = getattr(core.ops, op_type)(x, offset, mask, weight, + *attrs) + if bias is not None: + out = nn.elementwise_add(pre_bias, bias, axis=1) + else: + out = pre_bias + else: + check_variable_and_dtype(x, "x", ['float32', 'float64'], + 'deform_conv2d') + check_variable_and_dtype(offset, "offset", ['float32', 'float64'], + 'deform_conv2d') + + num_channels = x.shape[1] + + helper 
= LayerHelper('deformable_conv', **locals()) + dtype = helper.input_dtype() + + stride = utils.convert_to_list(stride, 2, 'stride') + padding = utils.convert_to_list(padding, 2, 'padding') + dilation = utils.convert_to_list(dilation, 2, 'dilation') + + pre_bias = helper.create_variable_for_type_inference(dtype) + + if use_deform_conv2d_v1: + op_type = 'deformable_conv_v1' + inputs = { + 'Input': x, + 'Filter': weight, + 'Offset': offset, + } + else: + op_type = 'deformable_conv' + inputs = { + 'Input': x, + 'Filter': weight, + 'Offset': offset, + 'Mask': mask, + } + + outputs = {"Output": pre_bias} + attrs = { + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'deformable_groups': 1, + 'im2col_step': 1, + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + if bias is not None: + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [bias]}, + outputs={'Out': [out]}, + attrs={'axis': 1}) + else: + out = pre_bias + return out + + +class DeformConv2D(Layer): + r""" + Compute 2-D deformable convolution on 4-D input. + Given input image x, output feature map y, the deformable convolution operation can be expressed as follows: + + + Deformable Convolution v2: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k} + + Deformable Convolution v1: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)} + + Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, + and :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results + <https://arxiv.org/abs/1811.11168v2>`_ and `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_. 
+ + Example: + - Input: + + x shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + weight shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + + offset shape: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})` + + mask shape: :math:`(N, H_f * W_f, H_{out}, W_{out})` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\ + W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. + padding (int|list|tuple, optional): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the DeformConv2D layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when groups=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of DeformConv2D. If it is set to None or one attribute of ParamAttr, DeformConv2D + will create ParamAttr as param_attr. 
If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of DeformConv2D. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, DeformConv2D + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized to zero. The default value is None. + Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Shape: + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + - offset: :math:`(N, 2 * H_f * W_f, H_{out}, W_{out})` + - mask: :math:`(N, H_f * W_f, H_{out}, W_{out})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + Where + .. math:: + H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 \\ + W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + Examples: + .. 
code-block:: python + + #deformable conv v2: + + import paddle + input = paddle.rand((8, 1, 28, 28)) + kh, kw = 3, 3 + # offset shape should be [bs, 2 * kh * kw, out_h, out_w] + # mask shape should be [bs, kh * kw, out_h, out_w] + # In this case, for an input of 28, stride of 1 + # and kernel size of 3, without padding, the output size is 26 + offset = paddle.rand((8, 2 * kh * kw, 26, 26)) + mask = paddle.rand((8, kh * kw, 26, 26)) + deform_conv = paddle.vision.ops.DeformConv2D( + in_channels=1, + out_channels=16, + kernel_size=[kh, kw]) + out = deform_conv(input, offset, mask) + print(out.shape) + # returns + [8, 16, 26, 26] + + #deformable conv v1: + + import paddle + input = paddle.rand((8, 1, 28, 28)) + kh, kw = 3, 3 + # offset shape should be [bs, 2 * kh * kw, out_h, out_w] + # mask shape should be [bs, kh * kw, out_h, out_w] + # In this case, for an input of 28, stride of 1 + # and kernel size of 3, without padding, the output size is 26 + offset = paddle.rand((8, 2 * kh * kw, 26, 26)) + deform_conv = paddle.vision.ops.DeformConv2D( + in_channels=1, + out_channels=16, + kernel_size=[kh, kw]) + out = deform_conv(input, offset) + print(out.shape) + # returns + [8, 16, 26, 26] + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None): + super(DeformConv2D, self).__init__() + assert weight_attr is not False, "weight_attr should not be False in Conv." 
+ self._weight_attr = weight_attr + self._bias_attr = bias_attr + self._groups = groups + self._in_channels = in_channels + self._out_channels = out_channels + self._channel_dim = 1 + + self._stride = utils.convert_to_list(stride, 2, 'stride') + self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + self._kernel_size = utils.convert_to_list(kernel_size, 2, 'kernel_size') + + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups.") + + self._padding = utils.convert_to_list(padding, 2, 'padding') + + filter_shape = [out_channels, in_channels // groups] + self._kernel_size + + def _get_default_param_initializer(): + filter_elem_num = np.prod(self._kernel_size) * self._in_channels + std = (2.0 / filter_elem_num)**0.5 + return Normal(0.0, std, 0) + + self.weight = self.create_parameter( + shape=filter_shape, + attr=self._weight_attr, + default_initializer=_get_default_param_initializer()) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=[self._out_channels], is_bias=True) + + def forward(self, x, offset, mask=None): + out = deform_conv2d( + x=x, + offset=offset, + weight=self.weight, + bias=self.bias, + stride=self._stride, + padding=self._padding, + dilation=self._dilation, + groups=self._groups, + mask=mask) + return out