From c50fb58c8eb662eda4e3a66bc5633a83ece61446 Mon Sep 17 00:00:00 2001 From: cjt222 <806512756@qq.com> Date: Tue, 18 Jun 2019 06:39:18 +0800 Subject: [PATCH] test=release/1.5 (#18134) cherry pick for deform roi pooling --- paddle/fluid/API.spec | 1 + .../operators/deformable_psroi_pooling_op.cc | 270 +++++++++ .../operators/deformable_psroi_pooling_op.cu | 523 ++++++++++++++++++ .../operators/deformable_psroi_pooling_op.h | 479 ++++++++++++++++ python/paddle/fluid/layers/nn.py | 115 ++++ .../test_deformable_psroi_pooling.py | 369 ++++++++++++ .../fluid/tests/unittests/test_layers.py | 29 + 7 files changed, 1786 insertions(+) create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.cc create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.cu create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index f4130f03a4..29bf80270f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -238,6 +238,7 @@ paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_c paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', '3126e3039e752ce26077f1efaca355c6')) paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'ccf6bb7912afd2818d24bc45461e807a')) paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', 'c896b66265a60bd3c5510f66e6e02919')) +paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 
'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '65b8dbe13e00c4dc8224652f6ff89540')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9e87163ba32003f21d2c9d8c6a605ada')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'dce69a78638da8f7ad80b1fc00ed2029')) paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6')) diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc new file mode 100644 index 0000000000..d17f22b9b4 --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/deformable_psroi_pooling_op.h" +#include +#include +#include +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { +class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor), " + "the input of Deformable PSROIPooling. " + "The shape of input tensor is [N,C,H,W]. Where N is batch size, " + "C is number of input channels, " + "H is height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [[x1, y1, x2, y2], ...]. " + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddInput("Trans", + "(Tensor)," + "offset of features on ROIs while pooling. " + "The format is NCHW, where N is number of ROIs, " + "C is number of channels, which indicate the offset distance " + "in the x and y directions, " + "H is pooled height, and " + "W is pooled width."); + AddAttr("no_trans", + "(bool), " + "whether add offset to get new value or not while roi " + "pooling, which value is True or False"); + AddAttr("spatial_scale", + "(float), " + "ratio of input feature map height (or width) to " + "raw image height (or width). Equals the reciprocal " + "of total stride in convolutional layers."); + AddAttr("output_dim", + "(int), " + "the number of output channels, which should be less than " + "input channels. Deformable roi_pooling requires " + "output_channels = input_channels, while deformable " + "psroi_pooling requires output_channels = input_channels " + "* pooled_height * pooled_width"); + AddAttr>( + "group_size", + "(vector), " + "the number of groups which input channels are divided." + "(eg.number of input channels is k1*k2*(C+1), which k1 and k2 " + "are group width and height and C+1 is number of output " + "chanels. 
eg.(4, 6), which 4 is height of group and 6 is " + "width of group"); + AddAttr("pooled_height", + "(int), " + "the pooled output height."); + AddAttr("pooled_width", + "(int), " + "the pooled output width."); + AddAttr>( + "part_size", + "(vector), " + "the height and width of offset, eg.(4, 6), which height is 4 " + " and width is 6"); + AddAttr("sample_per_part", + "(int), " + "the number of samples in each bin"); + AddAttr("trans_std", + "(float), " + "Coefficient of offset"); + AddOutput("TopCount", + "(Tensor), " + "record the number of pixel in average pooling to in each bin. " + "The format is NCHW, where N is the number of ROIs, " + "C is the number of output channels, " + "H is the height of output, and " + "W is the width of output."); + AddOutput("Output", + "(Tensor), " + "the output of Deformable PSROIPooling. " + "The format is NCHW, where N is the number of ROIs, " + "C is the number of output channels, " + "H is the height of output, and " + "W is thewidth of output. "); + AddComment(R"DOC( +**DeformablePSROIPooling Operator** +DeformablePSROIPooling is a new method based Region of interest pooling +(also known as RoI pooling). +The operator has four steps: + +1. Dividing each region proposal into equal-sized sections with + the pooled_width and pooled_height. + +2. Add offset to pixel in ROI to get new location and the new value which are + computed directly through bilinear interpolation with four nearest pixel. + +3. Sample several points to get average values in each bin. + +4. Copying these average values to the output buffer. + +DeformablePSROIPooling is part of Deformable Convolutional Networks, +please refer to https://arxiv.org/abs/1703.06211 for more details. 
+ )DOC"); + } +}; + +class DeformablePSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DeformablePSROIPoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Trans"), + "Input(Trans) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("TopCount"), + "Output(TopCount) of DeformablePSROIPoolOp " + "should not be null."); + auto input_dims = ctx->GetInputDim("Input"); + auto rois_dims = ctx->GetInputDim("ROIs"); + auto trans_dims = ctx->GetInputDim("Trans"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[ x1, y1, x2, y2], ...]."); + PADDLE_ENFORCE(trans_dims.size() == 4, + "The format of Input Trans is (N, 2, H, W)."); + auto pooled_height = ctx->Attrs().Get("pooled_height"); + auto pooled_width = ctx->Attrs().Get("pooled_width"); + auto spatial_scale = ctx->Attrs().Get("spatial_scale"); + auto output_channels = ctx->Attrs().Get("output_dim"); + auto group_size = ctx->Attrs().Get>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto part_size = ctx->Attrs().Get>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx->Attrs().Get("sample_per_part"); + auto trans_std = ctx->Attrs().Get("trans_std"); + PADDLE_ENFORCE(trans_std >= 0.0f, "trans_std must greater than 0.0"); + PADDLE_ENFORCE(input_dims[1] >= output_channels, + "input channels must greater than out_channels"); + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled height must greater than 0"); 
+ PADDLE_ENFORCE_GT(pooled_width, 0, "The pooled width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + PADDLE_ENFORCE_EQ(group_size.size(), 2, + "The size of group_size should be 2."); + PADDLE_ENFORCE_GT(group_height, 0, + "The group_height in group_size must greater than 0"); + PADDLE_ENFORCE_GT(group_width, 0, + "The group_width in group_size must greater than 0"); + PADDLE_ENFORCE_EQ(part_size.size(), 2, + "The size of part_size should be 2."); + PADDLE_ENFORCE_GT(part_height, 0, + "The part_height in part_size must greater than 0"); + PADDLE_ENFORCE_GT(part_width, 0, + "The part_width in part_size must greater than 0"); + PADDLE_ENFORCE(part_height <= trans_dims[2], + "The height of trans must greater than part_height"); + PADDLE_ENFORCE(part_width <= trans_dims[3], + "The width of trans must greater than part_width"); + PADDLE_ENFORCE_GT(sample_per_part, 0, + "The sample_per_part must greater than 0"); + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = output_channels; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Output", out_dims); + ctx->SetOutputDim("TopCount", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); + } +}; + +class DeformablePSROIPoolGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("deformable_psroi_pooling_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Trans", Input("Trans")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("TopCount", Output("TopCount")); + op->SetInput(framework::GradVarName("Output"), 
OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Trans"), InputGrad("Trans")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + +class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")), + "The gradient of Output should not be null."); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + if (ctx->HasOutput(framework::GradVarName("Trans"))) { + ctx->SetOutputDim(framework::GradVarName("Trans"), + ctx->GetInputDim("Trans")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Trans")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OPERATOR(deformable_psroi_pooling, ops::DeformablePSROIPoolOp, + ops::DeformablePSROIPoolOpMaker, + ops::DeformablePSROIPoolGradOpDescMaker); +REGISTER_OPERATOR(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling, + ops::DeformablePSROIPoolCPUKernel, + ops::DeformablePSROIPoolCPUKernel); +REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradCPUKernel, + ops::DeformablePSROIPoolGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu new file mode 100644 index 0000000000..800ff5ef85 --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -0,0 +1,523 @@ +// Copyright (c) 2019 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_psroi_pooling_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interpolation(const T* data, const T x, const T y, + const int width, const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int 
count, const T* bottom_data, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const T* bottom_rois, + const T* bottom_trans, const bool no_trans, const T trans_std, + const int sample_per_part, const int output_dim, const int group_height, + const int group_width, const int part_height, const int part_width, + const int num_classes, const int channels_each_class, T* top_data, + T* top_count, int* roi_batch_id_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + + // location of roi on feature map + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval ineach bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? 
static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = min(max(gw, 0), group_width - 1); + gh = min(max(gh, 0), group_height - 1); + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_height + gh) * group_width + gw; + // bilinear interpolation + T val = bilinear_interpolation(offset_bottom_data + c * height * width, + w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? 
static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const LoDTensor* rois = ctx.Input("ROIs"); + const Tensor* trans = ctx.Input("Trans"); + Tensor* out = ctx.Output("Output"); + out->mutable_data(ctx.GetPlace()); + Tensor* top_count = ctx.Output("TopCount"); + top_count->mutable_data(ctx.GetPlace()); + + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + PADDLE_ENFORCE_EQ(num_rois, out->dims()[0], + "number of rois should be same with number of output"); + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + PADDLE_ENFORCE(channels_each_class >= 1, + "channels_each must greater than 1"); + + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? 
NULL : trans->data(); + + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch, + "The rois_batch_size and imgs batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + T* top_data = out->mutable_data(ctx.GetPlace()); + T* top_count_data = top_count->mutable_data(ctx.GetPlace()); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (T)spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + (T)trans_std, sample_per_part, output_dim, group_height, group_width, + part_height, part_width, num_classes, channels_each_class, top_data, + top_count_data, roi_id_data); + } +}; + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, const T* top_diff, const T* top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T* bottom_data_diff, T* bottom_trans_diff, + const T* bottom_data, const T* bottom_rois, const T* 
bottom_trans, + const bool no_trans, const T trans_std, const int sample_per_part, + const int group_height, const int group_width, const int part_height, + const int part_width, const int num_classes, const int channels_each_class, + int* roi_batch_id_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + int num_box = count / pooled_height / pooled_width / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + + // location of roi on feature map + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = max(roi_end_w - roi_start_w, 0.1); + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? 
static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + + T diff_val = top_diff[index] / top_count[index]; + const T* offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = min(max(gw, 0), group_width - 1); + gh = min(max(gh, 0), group_height - 1); + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_height + gh) * group_width + gw; + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + + // compute coefficient of gradient + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + + // compute gradient of input + if (bottom_data_diff) { + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x0, + q00 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x0, + q01 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + 
bottom_index_base + y0 * width + x1, + q10 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x1, + q11 * diff_val); + } + + // compute gradient of trans + if (no_trans || bottom_trans_diff == NULL) { + continue; + } + + T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y - + u00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x - + u00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + platform::CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w, + diff_x); + platform::CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_height + + part_h) * + part_width + + part_w, + diff_y); + } + } + } +} + +template +class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const LoDTensor* rois = ctx.Input("ROIs"); + const Tensor* trans = ctx.Input("Trans"); + const Tensor* top_count = ctx.Input("TopCount"); + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* trans_grad = ctx.Output(framework::GradVarName("Trans")); + + math::SetConstant set_zero; + auto& dev_ctx = ctx.cuda_device_context(); + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + } + if (trans_grad) { + trans_grad->mutable_data(ctx.GetPlace()); + 
set_zero(dev_ctx, trans_grad, static_cast(0)); + } + + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + + const T* top_diff = output_grad->data(); + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + T* bottom_data_diff = NULL; + T* bottom_trans_diff = NULL; + if (input_grad) { + bottom_data_diff = input_grad->mutable_data(ctx.GetPlace()); + } + if (trans_grad) { + bottom_trans_diff = + no_trans ? 
NULL : trans_grad->mutable_data(ctx.GetPlace()); + } + + const T* top_count_data = top_count->data(); + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch, + "The rois_batch_size and imgs batch_size must be the same."); + + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height, + group_width, part_height, part_width, num_classes, channels_each_class, + roi_id_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling, + ops::DeformablePSROIPoolCUDAKernel, + ops::DeformablePSROIPoolCUDAKernel); +REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradCUDAKernel, + ops::DeformablePSROIPoolGradCUDAKernel); diff --git 
a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h new file mode 100644 index 0000000000..a22ccbd3ff --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -0,0 +1,479 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +T bilinear_interp(const T* data, const T x, const T y, const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +void DeformablePSROIPoolForwardCPUKernel( + const int count, const T* bottom_data, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int 
pooled_width, const T* bottom_rois, + const T* bottom_trans, const bool no_trans, const float trans_std, + const int sample_per_part, const int output_dim, const int group_height, + const int group_width, const int part_height, const int part_width, + const int num_classes, const int channels_each_class, T* top_data, + T* top_count, const int batch_size, int* roi_batch_id_data, + const LoDTensor* rois) { + for (int ix = 0; ix < count; ix++) { + int pw = ix % pooled_width; + int ph = (ix / pooled_width) % pooled_height; + int ctop = (ix / pooled_width / pooled_height) % output_dim; + int n = ix / pooled_width / pooled_height / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + + int roi_batch_ind = roi_batch_id_data[n]; + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? 
static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int num_sample = 0; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = std::min(std::max(gw, 0), group_width - 1); + gh = std::min(std::max(gh, 0), group_height - 1); + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, T(0.)), T(width - 1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_height + gh) * group_width + gw; + // bilinear interpolation to get value + T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, + width, height); + sum += val; + num_sample++; + } + } + top_data[ix] = num_sample == 0 ? 
static_cast(0) : sum / num_sample; + top_count[ix] = num_sample; + } +} + +template +class DeformablePSROIPoolCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* rois = ctx.Input("ROIs"); + auto* trans = ctx.Input("Trans"); + auto* out = ctx.Output("Output"); + out->mutable_data(ctx.GetPlace()); + auto* top_count = ctx.Output("TopCount"); + top_count->mutable_data(ctx.GetPlace()); + + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, out, static_cast(0)); + set_zero(dev_ctx, top_count, static_cast(0)); + + const int num_rois = rois->dims()[0]; + PADDLE_ENFORCE_EQ(num_rois, out->dims()[0], + "number of rois should be same with number of output"); + + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + int batch = static_cast(input->dims()[0]); + int channels = static_cast(input->dims()[1]); + int height = static_cast(input->dims()[2]); + int width = static_cast(input->dims()[3]); + int channels_trans = no_trans ? 2 : trans->dims()[1]; + auto count = num_rois * output_dim * pooled_height * pooled_width; + auto num_classes = no_trans ? 1 : channels_trans / 2; + auto channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + PADDLE_ENFORCE(channels_each_class >= 1, + "channels_each must greater than 1"); + + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + T* top_data = out->mutable_data(ctx.GetPlace()); + T* top_count_data = top_count->mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ(rois_batch_size, batch, + "The rois_batch_size must equal to batch_size of img."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + DeformablePSROIPoolForwardCPUKernel( + count, bottom_data, (T)spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_height, group_width, + part_height, part_width, num_classes, channels_each_class, top_data, + top_count_data, batch, roi_batch_id_data, rois); + } +}; + +template +void DeformablePSROIPoolBackwardAccCPUKernel( + const int count, const T* top_diff, const T* top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T* bottom_data_diff, T* bottom_trans_diff, + const T* bottom_data, const T* bottom_rois, const T* bottom_trans, + const bool no_trans, const float trans_std, const int sample_per_part, + const int group_height, const int group_width, const int part_height, + const int part_width, const int num_classes, const int channels_each_class, + const int batch_size, int* roi_batch_id_data, const LoDTensor* rois) { + for (int index = 0; index < count; index++) { + int pw = 
index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // location of roi on feature map + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); // width axis scales by part_width, matching the forward kernel + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? 
static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + + T diff_val = top_diff[index] / top_count[index]; + const T* offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = std::min(std::max(gw, 0), group_width - 1); + gh = std::min(std::max(gh, 0), group_height - 1); + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, T(0.)), T(width - 1.)); + h = std::min(std::max(h, T(0.)), T(height - 1.)); + int c = (ctop * group_height + gh) * group_width + gw; + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + + // compute coefficient of gradient + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + + // compute gradient of input + if (bottom_data_diff != NULL) { + T* offset_bottom_data_diff_addr00 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x0; + T* offset_bottom_data_diff_addr01 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x0; + T* offset_bottom_data_diff_addr10 = + bottom_data_diff 
+ roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x1; + T* offset_bottom_data_diff_addr11 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x1; + *offset_bottom_data_diff_addr00 = + *offset_bottom_data_diff_addr00 + q00 * diff_val; + *offset_bottom_data_diff_addr01 = + *offset_bottom_data_diff_addr01 + q01 * diff_val; + *offset_bottom_data_diff_addr10 = + *offset_bottom_data_diff_addr10 + q10 * diff_val; + *offset_bottom_data_diff_addr11 = + *offset_bottom_data_diff_addr11 + q11 * diff_val; + } + + // compute gradient of trans + if (no_trans || bottom_trans_diff == NULL) { + continue; + } + + T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + + T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y - + u00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x - + u00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + T* offset_bottom_trans_diff_x = + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w; + T* offset_bottom_trans_diff_y = + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) * + part_width + + part_w; + + *offset_bottom_trans_diff_x = *offset_bottom_trans_diff_x + diff_x; + *offset_bottom_trans_diff_y = *offset_bottom_trans_diff_y + diff_y; + } + } + } +} + +template +class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* rois = ctx.Input("ROIs"); + auto* trans = ctx.Input("Trans"); + auto* top_count = ctx.Input("TopCount"); + auto* 
output_grad = ctx.Input(framework::GradVarName("Output")); + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(.0)); + } + auto* trans_grad = ctx.Output(framework::GradVarName("Trans")); + if (trans_grad) { + trans_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, trans_grad, static_cast(.0)); + } + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + + const T* top_diff = output_grad->data(); + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? 
NULL : trans->data(); + + T* bottom_data_diff = NULL; + T* bottom_trans_diff = NULL; + if (input_grad) { + bottom_data_diff = input_grad->mutable_data(ctx.GetPlace()); + } + if (trans_grad) { + bottom_trans_diff = + no_trans ? NULL : trans_grad->mutable_data(ctx.GetPlace()); + } + + const T* top_count_data = top_count->data(); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + DeformablePSROIPoolBackwardAccCPUKernel( + count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height, + group_width, part_height, part_width, num_classes, channels_each_class, + batch, roi_batch_id_data, rois); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9bc4099613..8173440a52 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -203,6 +203,7 @@ __all__ = [ 'where', 'sign', 'deformable_conv', + 'deformable_roi_pooling', ] kIgnoreIndex = -100 @@ -12088,3 +12089,117 @@ def deformable_conv(input, output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return output + + +def deformable_roi_pooling(input, + rois, + trans, + no_trans=False, + spatial_scale=1.0, + group_size=[1, 1], + pooled_height=1, + pooled_width=1, + part_size=None, + sample_per_part=1, + trans_std=0.1, + position_sensitive=False, + name=None): + """ + Deformable PSROI Pooling Layer + + Args: + input (Variable):The input of Deformable PSROIPooling.The 
shape of input tensor is + [N,C,H,W]. Where N is batch size,C is number of input channels,H + is height of the feature, and W is the width of the feature. + rois (Variable): ROIs (Regions of Interest) to pool over.It should be + a 2-D LoDTensor of shape (num_rois, 4), the lod level + is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is + the top left coordinates, and (x2, y2) is the bottom + right coordinates. + trans (Variable): Offset of features on ROIs while pooling.The format is NCHW, where + N is number of ROIs, C is number of channels, which indicate the offset distance + in the x and y directions, H is pooled height, and W is pooled width. + no_trans (bool): Whether to add offset to get new value or not while roi pooling, which + value is True or False. Default: False. + spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width). + Equals the reciprocal of total stride in convolutional layers, Default: 1.0. + group_size (list|tuple): The number of groups which input channels are divided.(eg.number of input channels + is k1*k2*(C+1), which k1 and k2 are group width and height and C+1 is number of output + chanels. eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. + pooled_height (integer): The pooled output height. Default: 1. + pooled_width (integer): The pooled output width. Default: 1. + part_size (list|tuple): The height and width of offset, eg.(4, 6), which height is 4 and width is 6, Default: + if None, default value is [pooled_height, pooled_width]. + sample_per_part (integer): The number of samples in each bin. Default: 1. + trans_std (float): Coefficient of offset. Default: 0.1. + position_sensitive (bool): Whether to choose deformable psroi pooling mode or not. Default: False. + name (str): Name of layer. Default: None. + Returns: + Variable: The tensor variable storing the deformable psroi pooling \ + result. + + + Examples: + .. 
code-block:: python + + input = fluid.layers.data(name="input", + shape=[2, 192, 64, 64], + dtype='float32', + append_batch_size=False) + rois = fluid.layers.data(name="rois", + shape=[4], + dtype='float32', + lod_level=1) + trans = fluid.layers.data(name="trans", + shape=[2, 384, 64, 64], + dtype='float32', + append_batch_size=False) + x = fluid.layers.nn.deformable_roi_pooling(input=input, + rois=rois, + trans=trans, + no_trans=False, + spatial_scale=1.0, + group_size=(1, 1), + pooled_height=8, + pooled_width=8, + part_size=(8, 8), + sample_per_part=4, + trans_std=0.1, + position_sensitive=False) + """ + + input_channels = input.shape[1] + if position_sensitive == False: + output_channels = input_channels + else: + output_channels = input_channels / pooled_height / pooled_width + + if part_size is None: + part_height = pooled_height + part_width = pooled_width + part_size = [part_height, part_width] + part_size = utils.convert_to_list(part_size, 2, 'part_size') + group_size = utils.convert_to_list(group_size, 2, 'group_size') + helper = LayerHelper('deformable_psroi_pooling', **locals()) + dtype = helper.input_dtype() + output = helper.create_variable_for_type_inference(dtype) + top_count = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="deformable_psroi_pooling", + inputs={"Input": input, + "ROIs": rois, + "Trans": trans}, + outputs={"Output": output, + "TopCount": top_count}, + attrs={ + "no_trans": no_trans, + "spatial_scale": spatial_scale, + "output_dim": output_channels, + "group_size": group_size, + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "part_size": part_size, + "sample_per_part": sample_per_part, + "trans_std": trans_std + }) + return output diff --git a/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py new file mode 100644 index 0000000000..6aa408e5d7 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py @@ -0,0 +1,369 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +from op_test import OpTest + + +def set_input(input, rois, trans): + inputs = {'Input': input, "ROIs": rois, "Trans": trans} + return inputs + + +def set_attrs(no_trans, spatial_scale, output_channels, group_size, + pooled_height, pooled_width, part_size, sample_per_part, + trans_std): + attrs = { + 'no_trans': no_trans, + 'spatial_scale': spatial_scale, + 'output_dim': output_channels, + 'group_size': group_size, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + 'part_size': part_size, + 'sample_per_part': sample_per_part, + 'trans_std': trans_std + } + return attrs + + +def set_outputs(output, top_count): + outputs = { + 'Output': output.astype('float32'), + 'TopCount': top_count.astype('float32') + } + return outputs + + +class TestDeformablePSROIPoolOp(OpTest): + def set_data(self): + self.start_test1() + self.start_test2() + self.start_test3() + self.start_test4() + + def start_test1(self): + self.init_test_case1() + self.make_rois() + self.calc_deformable_psroi_pooling() + + inputs = self.input + rois = (self.rois[:, 1:5], self.rois_lod) + trans = self.trans + self.inputs = set_input(inputs, rois, trans) + + no_trans = self.no_trans + spatial_scale = self.spatial_scale 
+ output_channels = self.output_channels + group_size = self.group_size + pooled_height = self.pooled_height + pooled_width = self.pooled_width + part_size = self.part_size + sample_per_part = self.sample_per_part + trans_std = self.trans_std + + self.attrs = set_attrs(no_trans, spatial_scale, output_channels, + group_size, pooled_height, pooled_width, + part_size, sample_per_part, trans_std) + + output = self.out.astype('float32') + top_count = self.top_count.astype('float32') + self.outputs = set_outputs(output, top_count) + + def start_test2(self): + self.init_test_case2() + self.make_rois() + self.calc_deformable_psroi_pooling() + + inputs = self.input + rois = (self.rois[:, 1:5], self.rois_lod) + trans = self.trans + self.inputs = set_input(inputs, rois, trans) + + no_trans = self.no_trans + spatial_scale = self.spatial_scale + output_channels = self.output_channels + group_size = self.group_size + pooled_height = self.pooled_height + pooled_width = self.pooled_width + part_size = self.part_size + sample_per_part = self.sample_per_part + trans_std = self.trans_std + + self.attrs = set_attrs(no_trans, spatial_scale, output_channels, + group_size, pooled_height, pooled_width, + part_size, sample_per_part, trans_std) + + output = self.out.astype('float32') + top_count = self.top_count.astype('float32') + self.outputs = set_outputs(output, top_count) + + def start_test3(self): + self.init_test_case3() + self.make_rois() + self.calc_deformable_psroi_pooling() + + inputs = self.input + rois = (self.rois[:, 1:5], self.rois_lod) + trans = self.trans + self.inputs = set_input(inputs, rois, trans) + + no_trans = self.no_trans + spatial_scale = self.spatial_scale + output_channels = self.output_channels + group_size = self.group_size + pooled_height = self.pooled_height + pooled_width = self.pooled_width + part_size = self.part_size + sample_per_part = self.sample_per_part + trans_std = self.trans_std + + self.attrs = set_attrs(no_trans, spatial_scale, output_channels, + 
group_size, pooled_height, pooled_width, + part_size, sample_per_part, trans_std) + + output = self.out.astype('float32') + top_count = self.top_count.astype('float32') + self.outputs = set_outputs(output, top_count) + + def start_test4(self): + self.init_test_case4() + self.make_rois() + self.calc_deformable_psroi_pooling() + + inputs = self.input + rois = (self.rois[:, 1:5], self.rois_lod) + trans = self.trans + self.inputs = set_input(inputs, rois, trans) + + no_trans = self.no_trans + spatial_scale = self.spatial_scale + output_channels = self.output_channels + group_size = self.group_size + pooled_height = self.pooled_height + pooled_width = self.pooled_width + part_size = self.part_size + sample_per_part = self.sample_per_part + trans_std = self.trans_std + + self.attrs = set_attrs(no_trans, spatial_scale, output_channels, + group_size, pooled_height, pooled_width, + part_size, sample_per_part, trans_std) + + output = self.out.astype('float32') + top_count = self.top_count.astype('float32') + self.outputs = set_outputs(output, top_count) + + def init_test_case1(self): + self.batch_size = 3 + self.channels = 3 * 2 * 2 + self.height = 12 + self.width = 12 + self.input_dim = [ + self.batch_size, self.channels, self.height, self.width + ] + self.no_trans = False + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 12 + self.group_size = [1, 1] + self.pooled_height = 4 + self.pooled_width = 4 + self.part_size = [4, 4] + self.sample_per_part = 2 + self.trans_std = 0.1 + self.input = np.random.random(self.input_dim).astype('float32') + + def init_test_case2(self): + self.batch_size = 2 + self.channels = 3 * 2 * 2 + self.height = 12 + self.width = 12 + self.input_dim = [ + self.batch_size, self.channels, self.height, self.width + ] + self.no_trans = True + self.spatial_scale = 1.0 / 2.0 + self.output_channels = 12 + self.group_size = [1, 1] + self.pooled_height = 7 + self.pooled_width = 7 + self.part_size = [7, 7] + self.sample_per_part = 4 + self.trans_std = 
0.1 + self.input = np.random.random(self.input_dim).astype('float32') + + def init_test_case3(self): + self.batch_size = 2 + self.channels = 3 * 2 * 2 + self.height = 12 + self.width = 12 + self.input_dim = [ + self.batch_size, self.channels, self.height, self.width + ] + self.no_trans = False + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 12 + self.group_size = [1, 1] + self.pooled_height = 3 + self.pooled_width = 3 + self.part_size = [3, 3] + self.sample_per_part = 3 + self.trans_std = 0.2 + self.input = np.random.random(self.input_dim).astype('float32') + + def init_test_case4(self): + self.batch_size = 2 + self.channels = 3 * 2 * 2 + self.height = 12 + self.width = 12 + self.input_dim = [ + self.batch_size, self.channels, self.height, self.width + ] + self.no_trans = True + self.spatial_scale = 1.0 / 2.0 + self.output_channels = 12 + self.group_size = [1, 1] + self.pooled_height = 6 + self.pooled_width = 2 + self.part_size = [6, 6] + self.sample_per_part = 6 + self.trans_std = 0.4 + self.input = np.random.random(self.input_dim).astype('float32') + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x_1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y_1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + x_2 = np.random.random_integers( + x_1 + self.pooled_width, self.width // self.spatial_scale) + y_2 = np.random.random_integers( + y_1 + self.pooled_height, self.height // self.spatial_scale) + roi = [bno, x_1, y_1, x_2, y_2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype("float32") + + def dmc_bilinear(self, data_im, p_h, p_w): + h_low = int(np.floor(p_h)) + w_low = int(np.floor(p_w)) + h_high = h_low + 1 + w_high = w_low + 1 + l_h = p_h - h_low + l_w = p_w - w_low + h_h = 1 - l_h + h_w = 1 - l_w + v_1 = 0 + if h_low >= 0 and 
def calc_deformable_psroi_pooling(self):
    """Compute the NumPy reference result for deformable PS-ROI pooling.

    Reads the test configuration from ``self`` (``rois``, ``input``,
    ``rois_num``, ``output_channels``, ``pooled_height``, ``pooled_width``,
    ``part_size``, ``group_size``, ``sample_per_part``, ``spatial_scale``,
    ``trans_std``, ``no_trans``, ``height``, ``width``) and stores:

    * ``self.trans``     -- random offsets of shape
      ``(rois_num, 2, part_size[0], part_size[1])``; index 0 of the second
      axis is used as the x offset and index 1 as the y offset.
    * ``self.out``       -- per-bin average of the bilinear samples, shape
      ``(rois_num, output_channels, pooled_height, pooled_width)``.
    * ``self.top_count`` -- number of samples that contributed to each bin
      (same shape as ``self.out``).

    Each row of ``self.rois`` is read as
    ``[batch_id, start_w, start_h, end_w, end_h]``.
    """
    output_shape = (self.rois_num, self.output_channels, self.pooled_height,
                    self.pooled_width)
    self.out = np.zeros(output_shape)
    self.trans = np.random.rand(self.rois_num, 2, self.part_size[0],
                                self.part_size[1]).astype('float32')
    # A sample counter has no reason to start from random noise; every bin
    # is assigned its real count below.
    self.top_count = np.zeros(output_shape).astype('float32')

    group_h, group_w = self.group_size[0], self.group_size[1]
    part_rows, part_cols = self.part_size[0], self.part_size[1]

    # np.ndindex walks the output in C order, i.e. the same order as
    # decoding a flat index into (n_out, ctop, p_h, p_w).
    for n_out, ctop, p_h, p_w in np.ndindex(*output_shape):
        roi = self.rois[n_out]
        roi_batch_id = int(roi[0])

        # ROI corners are rounded, scaled to feature-map coordinates and
        # shifted by half a pixel.
        roi_start_w = int(np.round(roi[1])) * self.spatial_scale - 0.5
        roi_start_h = int(np.round(roi[2])) * self.spatial_scale - 0.5
        roi_end_w = int(np.round(roi[3] + 1)) * self.spatial_scale - 0.5
        roi_end_h = int(np.round(roi[4] + 1)) * self.spatial_scale - 0.5

        # Keep degenerate ROIs from collapsing to a zero-sized bin.
        roi_width = max(roi_end_w - roi_start_w, 0.1)
        roi_height = max(roi_end_h - roi_start_h, 0.1)
        bin_size_h = float(roi_height) / float(self.pooled_height)
        bin_size_w = float(roi_width) / float(self.pooled_width)
        sub_bin_size_h = bin_size_h / self.sample_per_part
        sub_bin_size_w = bin_size_w / self.sample_per_part

        # Cell of the offset grid that governs this output bin.  float()
        # forces true division regardless of the interpreter version.
        part_h = int(float(p_h) / self.pooled_height * part_rows)
        part_w = int(float(p_w) / self.pooled_width * part_cols)
        if self.no_trans:
            trans_x = 0
            trans_y = 0
        else:
            trans_x = self.trans[n_out][0][part_h][part_w] * self.trans_std
            trans_y = self.trans[n_out][1][part_h][part_w] * self.trans_std

        # Top-left corner of the bin, displaced by the scaled offset.
        wstart = p_w * bin_size_w + roi_start_w + trans_x * roi_width
        hstart = p_h * bin_size_h + roi_start_h + trans_y * roi_height

        # Position-sensitive group of this bin.  The row index pairs with
        # group_size[0] / pooled_height and the column index with
        # group_size[1] / pooled_width, which is the layout c_sample
        # assumes below.  (The axes used to be swapped, which only gave
        # the right channel for square configurations.)
        g_h = np.floor(p_h * group_h / self.pooled_height)
        g_w = np.floor(p_w * group_w / self.pooled_width)
        g_h = min(max(g_h, 0), group_h - 1)
        g_w = min(max(g_w, 0), group_w - 1)
        c_sample = int((ctop * group_h + g_h) * group_w + g_w)

        input_i = self.input[roi_batch_id]
        sample_sum = 0
        num_sample = 0
        for i_w in range(self.sample_per_part):
            for i_h in range(self.sample_per_part):
                w_sample = wstart + i_w * sub_bin_size_w
                h_sample = hstart + i_h * sub_bin_size_h
                # Samples outside the feature map do not contribute.
                if w_sample < -0.5 or w_sample > self.width - 0.5 or \
                        h_sample < -0.5 or h_sample > self.height - 0.5:
                    continue
                w_sample = min(max(w_sample, 0.), self.width - 1.)
                h_sample = min(max(h_sample, 0.), self.height - 1.)
                sample_sum = sample_sum + self.dmc_bilinear(
                    input_i[c_sample], h_sample, w_sample)
                num_sample = num_sample + 1

        if num_sample == 0:
            self.out[n_out][ctop][p_h][p_w] = 0
        else:
            self.out[n_out][ctop][p_h][p_w] = sample_sum / num_sample
        self.top_count[n_out][ctop][p_h][p_w] = num_sample