Unverified  Commit 871af28d  authored by C cjt222, committed by GitHub

add deformable psroi pooling (#17827)

* add deformable psroi pooling

* test=develop
modify format

* fix bug

* test=develop
run ci

* test=develop
add API.spec

* add test_layers.py

* run ci again

* test=develop
add space between two lines

* test=develop
modify comment in nn.py

* update API.spec

* test=develop
rerun ci

* change input shape

* modify format of nn.py

* test=develop
fix API doc

* modify API comment

* test=develop
modify comment

* test=develop
add inference in nn.py

* test=develop
resolve conflict

* test=develop
update API.spec
Parent 40885c22
@@ -239,6 +239,7 @@ paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=No
paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'ccf6bb7912afd2818d24bc45461e807a'))
paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', 'c896b66265a60bd3c5510f66e6e02919'))
paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '65b8dbe13e00c4dc8224652f6ff89540'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '6e19128b46936edf9f3fad77860a1da8'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'dce69a78638da8f7ad80b1fc00ed2029'))
paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6'))
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input",
"(Tensor), "
"the input of Deformable PSROIPooling. "
"The shape of input tensor is [N,C,H,W]. Where N is batch size, "
"C is number of input channels, "
"H is height of the feature, and "
"W is the width of the feature.");
AddInput("ROIs",
"(LoDTensor), "
"ROIs (Regions of Interest) to pool over. "
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
"given as [[x1, y1, x2, y2], ...]. "
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the bottom right coordinates.");
AddInput("Trans",
"(Tensor),"
"offset of features on ROIs while pooling. "
"The format is NCHW, where N is number of ROIs, "
"C is number of channels, which indicate the offset distance "
"in the x and y directions, "
"H is pooled height, and "
"W is pooled width.");
AddAttr<bool>("no_trans",
"(bool), "
"whether add offset to get new value or not while roi "
"pooling, which value is True or False");
AddAttr<float>("spatial_scale",
"(float), "
"ratio of input feature map height (or width) to "
"raw image height (or width). Equals the reciprocal "
"of total stride in convolutional layers.");
AddAttr<int>("output_dim",
"(int), "
"the number of output channels, which should be less than "
"input channels. Deformable roi_pooling requires "
"output_channels = input_channels, while deformable "
"psroi_pooling requires output_channels = input_channels "
"* pooled_height * pooled_width");
    AddAttr<std::vector<int>>(
        "group_size",
        "(vector<int>), "
        "the number of groups into which the input channels are divided. "
        "E.g. if the number of input channels is k1 * k2 * (C + 1), then "
        "k1 and k2 are the group height and width and C + 1 is the number "
        "of output channels. E.g. (4, 6) means the group height is 4 and "
        "the group width is 6");
AddAttr<int>("pooled_height",
"(int), "
"the pooled output height.");
AddAttr<int>("pooled_width",
"(int), "
"the pooled output width.");
    AddAttr<std::vector<int>>(
        "part_size",
        "(vector<int>), "
        "the height and width of the offset, e.g. (4, 6) means the height "
        "is 4 and the width is 6");
AddAttr<int>("sample_per_part",
"(int), "
"the number of samples in each bin");
AddAttr<float>("trans_std",
"(float), "
"Coefficient of offset");
AddOutput("TopCount",
"(Tensor), "
"record the number of pixel in average pooling to in each bin. "
"The format is NCHW, where N is the number of ROIs, "
"C is the number of output channels, "
"H is the height of output, and "
"W is the width of output.");
AddOutput("Output",
"(Tensor), "
"the output of Deformable PSROIPooling. "
"The format is NCHW, where N is the number of ROIs, "
"C is the number of output channels, "
"H is the height of output, and "
"W is thewidth of output. ");
AddComment(R"DOC(
**DeformablePSROIPooling Operator**
DeformablePSROIPooling is a new method based Region of interest pooling
(also known as RoI pooling).
The operator has four steps:
1. Dividing each region proposal into equal-sized sections with
the pooled_width and pooled_height.
2. Add offset to pixel in ROI to get new location and the new value which are
computed directly through bilinear interpolation with four nearest pixel.
3. Sample several points to get average values in each bin.
4. Copying these average values to the output buffer.
DeformablePSROIPooling is part of Deformable Convolutional Networks,
please refer to https://arxiv.org/abs/1703.06211 for more details.
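Concretely, for bin (ph, pw) the sampled locations are
    w = wstart + iw * sub_bin_size_w,  h = hstart + ih * sub_bin_size_h,
where wstart = pw * bin_size_w + roi_start_w + trans_x * roi_width and
hstart = ph * bin_size_h + roi_start_h + trans_y * roi_height; the bin
output is the average of the bilinearly interpolated values at these
locations.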
)DOC");
}
};
class DeformablePSROIPoolOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of DeformablePSROIPoolOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("ROIs"),
"Input(ROIs) of DeformablePSROIPoolOp "
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Trans"),
"Input(Trans) of DeformablePSROIPoolOp "
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of DeformablePSROIPoolOp "
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("TopCount"),
"Output(TopCount) of DeformablePSROIPoolOp "
"should not be null.");
auto input_dims = ctx->GetInputDim("Input");
auto rois_dims = ctx->GetInputDim("ROIs");
auto trans_dims = ctx->GetInputDim("Trans");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[ x1, y1, x2, y2], ...].");
PADDLE_ENFORCE(trans_dims.size() == 4,
"The format of Input Trans is (N, 2, H, W).");
auto pooled_height = ctx->Attrs().Get<int>("pooled_height");
auto pooled_width = ctx->Attrs().Get<int>("pooled_width");
auto spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
auto output_channels = ctx->Attrs().Get<int>("output_dim");
auto group_size = ctx->Attrs().Get<std::vector<int>>("group_size");
auto group_height = group_size[0];
auto group_width = group_size[1];
auto part_size = ctx->Attrs().Get<std::vector<int>>("part_size");
auto part_height = part_size[0];
auto part_width = part_size[1];
auto sample_per_part = ctx->Attrs().Get<int>("sample_per_part");
auto trans_std = ctx->Attrs().Get<float>("trans_std");
    PADDLE_ENFORCE(trans_std >= 0.0f,
                   "trans_std must be greater than or equal to 0.0");
    PADDLE_ENFORCE(input_dims[1] >= output_channels,
                   "input channels must be greater than or equal to "
                   "out_channels");
    PADDLE_ENFORCE_GT(pooled_height, 0,
                      "The pooled height must be greater than 0");
    PADDLE_ENFORCE_GT(pooled_width, 0,
                      "The pooled width must be greater than 0");
    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
                      "The spatial scale must be greater than 0");
    PADDLE_ENFORCE_EQ(group_size.size(), 2,
                      "The size of group_size should be 2.");
    PADDLE_ENFORCE_GT(group_height, 0,
                      "The group_height in group_size must be greater than 0");
    PADDLE_ENFORCE_GT(group_width, 0,
                      "The group_width in group_size must be greater than 0");
    PADDLE_ENFORCE_EQ(part_size.size(), 2,
                      "The size of part_size should be 2.");
    PADDLE_ENFORCE_GT(part_height, 0,
                      "The part_height in part_size must be greater than 0");
    PADDLE_ENFORCE_GT(part_width, 0,
                      "The part_width in part_size must be greater than 0");
    PADDLE_ENFORCE(part_height <= trans_dims[2],
                   "part_height must not be greater than the height of trans");
    PADDLE_ENFORCE(part_width <= trans_dims[3],
                   "part_width must not be greater than the width of trans");
    PADDLE_ENFORCE_GT(sample_per_part, 0,
                      "The sample_per_part must be greater than 0");
auto out_dims = input_dims;
out_dims[0] = rois_dims[0];
out_dims[1] = output_channels;
out_dims[2] = pooled_height;
out_dims[3] = pooled_width;
ctx->SetOutputDim("Output", out_dims);
ctx->SetOutputDim("TopCount", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
ctx.device_context());
}
};
class DeformablePSROIPoolGradOpDescMaker
: public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("deformable_psroi_pooling_grad");
op->SetInput("Input", Input("Input"));
op->SetInput("Trans", Input("Trans"));
op->SetInput("ROIs", Input("ROIs"));
op->SetInput("TopCount", Output("TopCount"));
op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("Trans"), InputGrad("Trans"));
op->SetAttrMap(Attrs());
return op;
}
};
class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")),
"The gradient of Output should not be null.");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("Input"));
}
if (ctx->HasOutput(framework::GradVarName("Trans"))) {
ctx->SetOutputDim(framework::GradVarName("Trans"),
ctx->GetInputDim("Trans"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("Trans")->type(),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(deformable_psroi_pooling, ops::DeformablePSROIPoolOp,
ops::DeformablePSROIPoolOpMaker,
ops::DeformablePSROIPoolGradOpDescMaker);
REGISTER_OPERATOR(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradOp);
REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling,
ops::DeformablePSROIPoolCPUKernel<CPU, float>,
ops::DeformablePSROIPoolCPUKernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradCPUKernel<CPU, float>,
ops::DeformablePSROIPoolGradCPUKernel<CPU, double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdio.h>
#include <algorithm>
#include <iostream>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
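// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads, so all `n` items are covered for
// any grid size.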
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
static inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
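// Bilinear interpolation at a fractional location (x, y): a distance
// weighted average of the four neighbouring pixels,
//   f(x, y) = (1 - dx)(1 - dy) * f(x1, y1) + (1 - dx) * dy * f(x1, y2)
//           + dx * (1 - dy) * f(x2, y1) + dx * dy * f(x2, y2),
// with dx = x - floor(x) and dy = y - floor(y).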
template <typename T>
__device__ T bilinear_interpolation(const T* data, const T x, const T y,
const int width, const int height) {
int x1 = floor(x);
int x2 = ceil(x);
int y1 = floor(y);
int y2 = ceil(y);
T dist_x = static_cast<T>(x - x1);
T dist_y = static_cast<T>(y - y1);
T value11 = data[y1 * width + x1];
T value12 = data[y2 * width + x1];
T value21 = data[y1 * width + x2];
T value22 = data[y2 * width + x2];
T value = (1 - dist_x) * (1 - dist_y) * value11 +
(1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 +
dist_x * dist_y * value22;
return value;
}
template <typename T>
__global__ void DeformablePSROIPoolForwardKernel(
const int count, const T* bottom_data, const T spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const T* bottom_rois,
const T* bottom_trans, const bool no_trans, const T trans_std,
const int sample_per_part, const int output_dim, const int group_height,
const int group_width, const int part_height, const int part_width,
const int num_classes, const int channels_each_class, T* top_data,
T* top_count, int* roi_batch_id_data) {
CUDA_KERNEL_LOOP(index, count) {
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
const T* offset_bottom_rois = bottom_rois + n * 4;
int roi_batch_ind = roi_batch_id_data[n];
// location of roi on feature map
T roi_start_w =
static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
T roi_start_h =
static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
T roi_end_w =
static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
T roi_end_h =
static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
// width and height of roi
T roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0
T roi_height = max(roi_end_h - roi_start_h, 0.1);
// width and height of each bin
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
    // sampling interval in each bin
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
// obtain offset of roi
int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
int class_id = ctop / channels_each_class;
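    // Trans is laid out as (num_rois, 2 * num_classes, part_height,
    // part_width): channel 2 * class_id holds the x offset and channel
    // 2 * class_id + 1 the y offset for part cell (part_h, part_w), each
    // scaled by trans_std.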
T trans_x =
no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
T trans_y = no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
// location of start after adding offset
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
T sum = 0;
    int num_sample = 0;
int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
gw = min(max(gw, 0), group_width - 1);
gh = min(max(gh, 0), group_height - 1);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels) * height * width;
// sampling in each bin
for (int ih = 0; ih < sample_per_part; ih++) {
for (int iw = 0; iw < sample_per_part; iw++) {
T w = wstart + iw * sub_bin_size_w;
T h = hstart + ih * sub_bin_size_h;
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_height + gh) * group_width + gw;
// bilinear interpolation
T val = bilinear_interpolation(offset_bottom_data + c * height * width,
w, h, width, height);
sum += val;
        num_sample++;
}
}
    top_data[index] = num_sample == 0 ? static_cast<T>(0) : sum / num_sample;
    top_count[index] = num_sample;
}
}
template <typename DeviceContext, typename T>
class DeformablePSROIPoolCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const LoDTensor* rois = ctx.Input<LoDTensor>("ROIs");
const Tensor* trans = ctx.Input<Tensor>("Trans");
Tensor* out = ctx.Output<Tensor>("Output");
out->mutable_data<T>(ctx.GetPlace());
Tensor* top_count = ctx.Output<Tensor>("TopCount");
top_count->mutable_data<T>(ctx.GetPlace());
auto no_trans = ctx.Attr<bool>("no_trans");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto output_dim = ctx.Attr<int>("output_dim");
auto group_size = ctx.Attr<std::vector<int>>("group_size");
auto group_height = group_size[0];
auto group_width = group_size[1];
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto part_size = ctx.Attr<std::vector<int>>("part_size");
auto part_height = part_size[0];
auto part_width = part_size[1];
auto sample_per_part = ctx.Attr<int>("sample_per_part");
auto trans_std = ctx.Attr<float>("trans_std");
const int batch = static_cast<int>(input->dims()[0]);
const int channels = static_cast<int>(input->dims()[1]);
const int height = static_cast<int>(input->dims()[2]);
const int width = static_cast<int>(input->dims()[3]);
const int channels_trans = no_trans ? 2 : trans->dims()[1];
const int num_rois = rois->dims()[0];
    PADDLE_ENFORCE_EQ(num_rois, out->dims()[0],
                      "the number of ROIs should equal the first dimension "
                      "of the output");
const int count = num_rois * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class =
no_trans ? output_dim : output_dim / num_classes;
    PADDLE_ENFORCE(channels_each_class >= 1,
                   "channels_each_class must be greater than or equal to 1");
const T* bottom_data = input->data<T>();
const T* bottom_rois = rois->data<T>();
const T* bottom_trans = no_trans ? NULL : trans->data<T>();
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({num_rois});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch,
"The rois_batch_size and imgs batch_size must be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
auto& dev_ctx = ctx.cuda_device_context();
auto& allocator =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = allocator.Allocate(bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
T* top_data = out->mutable_data<T>(ctx.GetPlace());
T* top_count_data = top_count->mutable_data<T>(ctx.GetPlace());
DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0,
dev_ctx.stream()>>>(
count, bottom_data, (T)spatial_scale, channels, height, width,
pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans,
(T)trans_std, sample_per_part, output_dim, group_height, group_width,
part_height, part_width, num_classes, channels_each_class, top_data,
top_count_data, roi_id_data);
}
};
template <typename T>
__global__ void DeformablePSROIPoolBackwardAccKernel(
const int count, const T* top_diff, const T* top_count, const int num_rois,
const T spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
const bool no_trans, const T trans_std, const int sample_per_part,
const int group_height, const int group_width, const int part_height,
const int part_width, const int num_classes, const int channels_each_class,
int* roi_batch_id_data) {
CUDA_KERNEL_LOOP(index, count) {
// The output is in order (n, ctop, ph, pw)
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
const T* offset_bottom_rois = bottom_rois + n * 4;
int roi_batch_ind = roi_batch_id_data[n];
// location of roi on feature map
T roi_start_w =
static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
T roi_start_h =
static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
T roi_end_w =
static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
T roi_end_h =
static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
// width and height of roi
T roi_width = max(roi_end_w - roi_start_w, 0.1);
T roi_height = max(roi_end_h - roi_start_h, 0.1);
// width and height of each bin
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
// sampling interval in each bin
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
// obtain offset of roi
int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
int class_id = ctop / channels_each_class;
T trans_x =
no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
T trans_y = no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
// location of start after adding offset
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
if (top_count[index] <= 0) {
continue;
}
T diff_val = top_diff[index] / top_count[index];
const T* offset_bottom_data =
bottom_data + roi_batch_ind * channels * height * width;
int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
gw = min(max(gw, 0), group_width - 1);
gh = min(max(gh, 0), group_height - 1);
// sampling in each bin
for (int ih = 0; ih < sample_per_part; ih++) {
for (int iw = 0; iw < sample_per_part; iw++) {
T w = wstart + iw * sub_bin_size_w;
T h = hstart + ih * sub_bin_size_h;
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
continue;
}
w = min(max(w, 0.), width - 1.);
h = min(max(h, 0.), height - 1.);
int c = (ctop * group_height + gh) * group_width + gw;
int x0 = floor(w);
int x1 = ceil(w);
int y0 = floor(h);
int y1 = ceil(h);
// compute coefficient of gradient
T dist_x = w - x0, dist_y = h - y0;
T q00 = (1 - dist_x) * (1 - dist_y);
T q01 = (1 - dist_x) * dist_y;
T q10 = dist_x * (1 - dist_y);
T q11 = dist_x * dist_y;
int bottom_index_base = c * height * width;
// compute gradient of input
if (bottom_data_diff) {
platform::CudaAtomicAdd(
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y0 * width + x0,
q00 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y1 * width + x0,
q01 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y0 * width + x1,
q10 * diff_val);
platform::CudaAtomicAdd(
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y1 * width + x1,
q11 * diff_val);
}
// compute gradient of trans
if (no_trans || bottom_trans_diff == NULL) {
continue;
}
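        // Bilinear interpolation is piecewise linear in (w, h), so
        // df/dw = (u11 - u01) * dist_y + (u10 - u00) * (1 - dist_y); the
        // chain rule through w = ... + trans_x * roi_width contributes the
        // trans_std and roi_width factors below (and likewise for h).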
T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
u00 * (1 - dist_y)) *
trans_std * diff_val;
diff_x *= roi_width;
T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
u00 * (1 - dist_x)) *
trans_std * diff_val;
diff_y *= roi_height;
platform::CudaAtomicAdd(
bottom_trans_diff +
(((n * num_classes + class_id) * 2) * part_height + part_h) *
part_width +
part_w,
diff_x);
platform::CudaAtomicAdd(
bottom_trans_diff +
(((n * num_classes + class_id) * 2 + 1) * part_height +
part_h) *
part_width +
part_w,
diff_y);
}
}
}
}
template <typename DeviceContext, typename T>
class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* input = ctx.Input<Tensor>("Input");
const LoDTensor* rois = ctx.Input<LoDTensor>("ROIs");
const Tensor* trans = ctx.Input<Tensor>("Trans");
const Tensor* top_count = ctx.Input<Tensor>("TopCount");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* trans_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.cuda_device_context();
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(0));
}
if (trans_grad) {
trans_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, trans_grad, static_cast<T>(0));
}
auto no_trans = ctx.Attr<bool>("no_trans");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto output_dim = ctx.Attr<int>("output_dim");
auto group_size = ctx.Attr<std::vector<int>>("group_size");
auto group_height = group_size[0];
auto group_width = group_size[1];
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto part_size = ctx.Attr<std::vector<int>>("part_size");
auto part_height = part_size[0];
auto part_width = part_size[1];
auto sample_per_part = ctx.Attr<int>("sample_per_part");
auto trans_std = ctx.Attr<float>("trans_std");
const int batch = static_cast<int>(input->dims()[0]);
const int channels = static_cast<int>(input->dims()[1]);
const int height = static_cast<int>(input->dims()[2]);
const int width = static_cast<int>(input->dims()[3]);
const int channels_trans = no_trans ? 2 : trans->dims()[1];
const int num_rois = rois->dims()[0];
const int count = num_rois * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class =
no_trans ? output_dim : output_dim / num_classes;
const T* top_diff = output_grad->data<T>();
const T* bottom_data = input->data<T>();
const T* bottom_rois = rois->data<T>();
const T* bottom_trans = no_trans ? NULL : trans->data<T>();
T* bottom_data_diff = NULL;
T* bottom_trans_diff = NULL;
if (input_grad) {
bottom_data_diff = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (trans_grad) {
bottom_trans_diff =
no_trans ? NULL : trans_grad->mutable_data<T>(ctx.GetPlace());
}
const T* top_count_data = top_count->data<T>();
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({num_rois});
auto cplace = platform::CPUPlace();
int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch,
"The rois_batch_size and imgs batch_size must be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
auto& allocator =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = allocator.Allocate(bytes);
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
dev_ctx.stream());
DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS,
0, dev_ctx.stream()>>>(
count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels,
height, width, pooled_height, pooled_width, output_dim,
bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois,
bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height,
group_width, part_height, part_width, num_classes, channels_each_class,
roi_id_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling,
ops::DeformablePSROIPoolCUDAKernel<CUDA, float>,
ops::DeformablePSROIPoolCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad,
ops::DeformablePSROIPoolGradCUDAKernel<CUDA, float>,
ops::DeformablePSROIPoolGradCUDAKernel<CUDA, double>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T>
T bilinear_interp(const T* data, const T x, const T y, const int width,
const int height) {
int x1 = floor(x);
int x2 = ceil(x);
int y1 = floor(y);
int y2 = ceil(y);
T dist_x = static_cast<T>(x - x1);
T dist_y = static_cast<T>(y - y1);
T value11 = data[y1 * width + x1];
T value12 = data[y2 * width + x1];
T value21 = data[y1 * width + x2];
T value22 = data[y2 * width + x2];
T value = (1 - dist_x) * (1 - dist_y) * value11 +
(1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 +
dist_x * dist_y * value22;
return value;
}
template <typename T>
void DeformablePSROIPoolForwardCPUKernel(
const int count, const T* bottom_data, const T spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const T* bottom_rois,
const T* bottom_trans, const bool no_trans, const float trans_std,
const int sample_per_part, const int output_dim, const int group_height,
const int group_width, const int part_height, const int part_width,
const int num_classes, const int channels_each_class, T* top_data,
T* top_count, const int batch_size, int* roi_batch_id_data,
const LoDTensor* rois) {
for (int ix = 0; ix < count; ix++) {
int pw = ix % pooled_width;
int ph = (ix / pooled_width) % pooled_height;
int ctop = (ix / pooled_width / pooled_height) % output_dim;
int n = ix / pooled_width / pooled_height / output_dim;
const T* offset_bottom_rois = bottom_rois + n * 4;
int roi_batch_ind = roi_batch_id_data[n];
T roi_start_w =
static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
T roi_start_h =
static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
T roi_end_w =
static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
T roi_end_h =
static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
// width and height of roi
T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
// width and height of each bin
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
// sampling interval in each bin
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
// obtain offset of roi
int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
int class_id = ctop / channels_each_class;
T trans_x =
no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
T trans_y = no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
// location of start after adding offset
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
T sum = 0;
int num_sample = 0;
int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
gw = std::min(std::max(gw, 0), group_width - 1);
gh = std::min(std::max(gh, 0), group_height - 1);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels) * height * width;
// sampling in each bin
for (int ih = 0; ih < sample_per_part; ih++) {
for (int iw = 0; iw < sample_per_part; iw++) {
T w = wstart + iw * sub_bin_size_w;
T h = hstart + ih * sub_bin_size_h;
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
continue;
}
w = std::min(std::max(w, T(0.)), T(width - 1.));
        h = std::min(std::max(h, T(0.)), T(height - 1.));
int c = (ctop * group_height + gh) * group_width + gw;
// bilinear interpolation to get value
T val = bilinear_interp(offset_bottom_data + c * height * width, w, h,
width, height);
sum += val;
num_sample++;
}
}
top_data[ix] = num_sample == 0 ? static_cast<T>(0) : sum / num_sample;
top_count[ix] = num_sample;
}
}
template <typename DeviceContext, typename T>
class DeformablePSROIPoolCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* trans = ctx.Input<Tensor>("Trans");
auto* out = ctx.Output<Tensor>("Output");
out->mutable_data<T>(ctx.GetPlace());
auto* top_count = ctx.Output<Tensor>("TopCount");
top_count->mutable_data<T>(ctx.GetPlace());
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
set_zero(dev_ctx, out, static_cast<T>(0));
set_zero(dev_ctx, top_count, static_cast<T>(0));
const int num_rois = rois->dims()[0];
    PADDLE_ENFORCE_EQ(num_rois, out->dims()[0],
                      "the number of ROIs should equal the first dimension "
                      "of the output");
framework::Tensor roi_batch_id_list;
roi_batch_id_list.Resize({num_rois});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
auto no_trans = ctx.Attr<bool>("no_trans");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto output_dim = ctx.Attr<int>("output_dim");
auto group_size = ctx.Attr<std::vector<int>>("group_size");
auto group_height = group_size[0];
auto group_width = group_size[1];
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto part_size = ctx.Attr<std::vector<int>>("part_size");
auto part_height = part_size[0];
auto part_width = part_size[1];
auto sample_per_part = ctx.Attr<int>("sample_per_part");
auto trans_std = ctx.Attr<float>("trans_std");
int batch = static_cast<int>(input->dims()[0]);
int channels = static_cast<int>(input->dims()[1]);
int height = static_cast<int>(input->dims()[2]);
int width = static_cast<int>(input->dims()[3]);
int channels_trans = no_trans ? 2 : trans->dims()[1];
auto count = num_rois * output_dim * pooled_height * pooled_width;
auto num_classes = no_trans ? 1 : channels_trans / 2;
auto channels_each_class = no_trans ? output_dim : output_dim / num_classes;
    PADDLE_ENFORCE(channels_each_class >= 1,
                   "channels_each_class must be greater than or equal to 1");
const T* bottom_data = input->data<T>();
const T* bottom_rois = rois->data<T>();
const T* bottom_trans = no_trans ? NULL : trans->data<T>();
T* top_data = out->mutable_data<T>(ctx.GetPlace());
T* top_count_data = top_count->mutable_data<T>(ctx.GetPlace());
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(rois_batch_size, batch,
"The rois_batch_size must equal to batch_size of img.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
DeformablePSROIPoolForwardCPUKernel(
count, bottom_data, (T)spatial_scale, channels, height, width,
pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans,
trans_std, sample_per_part, output_dim, group_height, group_width,
part_height, part_width, num_classes, channels_each_class, top_data,
top_count_data, batch, roi_batch_id_data, rois);
}
};
template <typename T>
void DeformablePSROIPoolBackwardAccCPUKernel(
const int count, const T* top_diff, const T* top_count, const int num_rois,
const T spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int output_dim, T* bottom_data_diff, T* bottom_trans_diff,
const T* bottom_data, const T* bottom_rois, const T* bottom_trans,
const bool no_trans, const float trans_std, const int sample_per_part,
const int group_height, const int group_width, const int part_height,
const int part_width, const int num_classes, const int channels_each_class,
const int batch_size, int* roi_batch_id_data, const LoDTensor* rois) {
for (int index = 0; index < count; index++) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// location of roi on feature map
const T* offset_bottom_rois = bottom_rois + n * 4;
int roi_batch_ind = roi_batch_id_data[n];
T roi_start_w =
static_cast<T>(round(offset_bottom_rois[0])) * spatial_scale - 0.5;
T roi_start_h =
static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
T roi_end_w =
static_cast<T>(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5;
T roi_end_h =
static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
// width and height of roi
T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
// width and height of each bin
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
// sampling interval in each bin
T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
// obtain offset of roi
int part_h = floor(static_cast<T>(ph) / pooled_height * part_height);
    int part_w = floor(static_cast<T>(pw) / pooled_width * part_width);
int class_id = ctop / channels_each_class;
T trans_x =
no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2) * part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
T trans_y = no_trans
? static_cast<T>(0)
: bottom_trans[(((n * num_classes + class_id) * 2 + 1) *
part_height +
part_h) *
part_width +
part_w] *
static_cast<T>(trans_std);
// location of start after adding offset
T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
wstart += trans_x * roi_width;
T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
hstart += trans_y * roi_height;
if (top_count[index] <= 0) {
continue;
}
T diff_val = top_diff[index] / top_count[index];
const T* offset_bottom_data =
bottom_data + roi_batch_ind * channels * height * width;
int gw = floor(static_cast<T>(pw) * group_width / pooled_width);
int gh = floor(static_cast<T>(ph) * group_height / pooled_height);
gw = std::min(std::max(gw, 0), group_width - 1);
gh = std::min(std::max(gh, 0), group_height - 1);
// sampling in each bin
for (int ih = 0; ih < sample_per_part; ih++) {
for (int iw = 0; iw < sample_per_part; iw++) {
T w = wstart + iw * sub_bin_size_w;
T h = hstart + ih * sub_bin_size_h;
if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) {
continue;
}
w = std::min(std::max(w, T(0.)), T(width - 1.));
h = std::min(std::max(h, T(0.)), T(height - 1.));
int c = (ctop * group_height + gh) * group_width + gw;
int x0 = floor(w);
int x1 = ceil(w);
int y0 = floor(h);
int y1 = ceil(h);
// compute coefficient of gradient
T dist_x = w - x0, dist_y = h - y0;
T q00 = (1 - dist_x) * (1 - dist_y);
T q01 = (1 - dist_x) * dist_y;
T q10 = dist_x * (1 - dist_y);
T q11 = dist_x * dist_y;
int bottom_index_base = c * height * width;
// compute gradient of input
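          // serial CPU path: accumulate into the gradient buffers directly
          // (the CUDA kernel uses atomic adds instead)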
if (bottom_data_diff != NULL) {
T* offset_bottom_data_diff_addr00 =
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y0 * width + x0;
T* offset_bottom_data_diff_addr01 =
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y1 * width + x0;
T* offset_bottom_data_diff_addr10 =
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y0 * width + x1;
T* offset_bottom_data_diff_addr11 =
bottom_data_diff + roi_batch_ind * channels * height * width +
bottom_index_base + y1 * width + x1;
*offset_bottom_data_diff_addr00 =
*offset_bottom_data_diff_addr00 + q00 * diff_val;
*offset_bottom_data_diff_addr01 =
*offset_bottom_data_diff_addr01 + q01 * diff_val;
*offset_bottom_data_diff_addr10 =
*offset_bottom_data_diff_addr10 + q10 * diff_val;
*offset_bottom_data_diff_addr11 =
*offset_bottom_data_diff_addr11 + q11 * diff_val;
}
// compute gradient of trans
if (no_trans || bottom_trans_diff == NULL) {
continue;
}
T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y -
u00 * (1 - dist_y)) *
trans_std * diff_val;
diff_x *= roi_width;
T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x -
u00 * (1 - dist_x)) *
trans_std * diff_val;
diff_y *= roi_height;
T* offset_bottom_trans_diff_x =
bottom_trans_diff +
(((n * num_classes + class_id) * 2) * part_height + part_h) *
part_width +
part_w;
T* offset_bottom_trans_diff_y =
bottom_trans_diff +
(((n * num_classes + class_id) * 2 + 1) * part_height + part_h) *
part_width +
part_w;
*offset_bottom_trans_diff_x = *offset_bottom_trans_diff_x + diff_x;
*offset_bottom_trans_diff_y = *offset_bottom_trans_diff_y + diff_y;
}
}
}
}
template <typename DeviceContext, typename T>
class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("Input");
auto* rois = ctx.Input<LoDTensor>("ROIs");
auto* trans = ctx.Input<Tensor>("Trans");
auto* top_count = ctx.Input<Tensor>("TopCount");
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
math::SetConstant<DeviceContext, T> set_zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, input_grad, static_cast<T>(.0));
}
auto* trans_grad = ctx.Output<Tensor>(framework::GradVarName("Trans"));
if (trans_grad) {
trans_grad->mutable_data<T>(ctx.GetPlace());
set_zero(dev_ctx, trans_grad, static_cast<T>(.0));
}
auto no_trans = ctx.Attr<bool>("no_trans");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto output_dim = ctx.Attr<int>("output_dim");
auto group_size = ctx.Attr<std::vector<int>>("group_size");
auto group_height = group_size[0];
auto group_width = group_size[1];
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto part_size = ctx.Attr<std::vector<int>>("part_size");
auto part_height = part_size[0];
auto part_width = part_size[1];
auto sample_per_part = ctx.Attr<int>("sample_per_part");
auto trans_std = ctx.Attr<float>("trans_std");
const int batch = static_cast<int>(input->dims()[0]);
const int channels = static_cast<int>(input->dims()[1]);
const int height = static_cast<int>(input->dims()[2]);
const int width = static_cast<int>(input->dims()[3]);
const int channels_trans = no_trans ? 2 : trans->dims()[1];
const int num_rois = rois->dims()[0];
const int count = num_rois * output_dim * pooled_height * pooled_width;
const int num_classes = no_trans ? 1 : channels_trans / 2;
const int channels_each_class =
no_trans ? output_dim : output_dim / num_classes;
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({num_rois});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
const T* top_diff = output_grad->data<T>();
const T* bottom_data = input->data<T>();
const T* bottom_rois = rois->data<T>();
const T* bottom_trans = no_trans ? NULL : trans->data<T>();
T* bottom_data_diff = NULL;
T* bottom_trans_diff = NULL;
if (input_grad) {
bottom_data_diff = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (trans_grad) {
bottom_trans_diff =
no_trans ? NULL : trans_grad->mutable_data<T>(ctx.GetPlace());
}
const T* top_count_data = top_count->data<T>();
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
DeformablePSROIPoolBackwardAccCPUKernel(
count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels,
height, width, pooled_height, pooled_width, output_dim,
bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois,
bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height,
group_width, part_height, part_width, num_classes, channels_each_class,
batch, roi_batch_id_data, rois);
}
};
} // namespace operators
} // namespace paddle
@@ -204,6 +204,7 @@ __all__ = [
'sign',
'deformable_conv',
'unfold',
'deformable_roi_pooling',
]
kIgnoreIndex = -100
@@ -12168,3 +12169,117 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
"dilations": dilations
})
return out
def deformable_roi_pooling(input,
rois,
trans,
no_trans=False,
spatial_scale=1.0,
group_size=[1, 1],
pooled_height=1,
pooled_width=1,
part_size=None,
sample_per_part=1,
trans_std=0.1,
position_sensitive=False,
name=None):
"""
    Deformable PSROI Pooling Layer.

    Args:
        input (Variable): The input of Deformable PSROIPooling. The shape of
            the input tensor is [N, C, H, W], where N is the batch size, C is
            the number of input channels, H is the height of the feature, and
            W is the width of the feature.
        rois (Variable): ROIs (Regions of Interest) to pool over. It should be
            a 2-D LoDTensor of shape (num_rois, 4) with lod level 1, given as
            [[x1, y1, x2, y2], ...]. (x1, y1) is the top left coordinate, and
            (x2, y2) is the bottom right coordinate.
        trans (Variable): Offset of features on ROIs while pooling. The format
            is NCHW, where N is the number of ROIs, C is the number of
            channels, which indicates the offset distance in the x and y
            directions, H is the pooled height, and W is the pooled width.
        no_trans (bool): Whether to add an offset to get new values while ROI
            pooling; can be True or False. Default: False.
        spatial_scale (float): Ratio of input feature map height (or width) to
            raw image height (or width). Equals the reciprocal of the total
            stride in convolutional layers. Default: 1.0.
        group_size (list|tuple): The number of groups into which the input
            channels are divided (e.g. if the number of input channels is
            k1 * k2 * (C + 1), then k1 and k2 are the group height and width
            and C + 1 is the number of output channels; e.g. (4, 6) means the
            group height is 4 and the group width is 6). Default: [1, 1].
        pooled_height (integer): The pooled output height. Default: 1.
        pooled_width (integer): The pooled output width. Default: 1.
        part_size (list|tuple): The height and width of the offset, e.g.
            (4, 6) means the height is 4 and the width is 6. Default: None; if
            None, the value is [pooled_height, pooled_width].
        sample_per_part (integer): The number of samples in each bin.
            Default: 1.
        trans_std (float): Coefficient of the offset. Default: 0.1.
        position_sensitive (bool): Whether to choose the deformable psroi
            pooling mode. Default: False.
        name (str): Name of the layer. Default: None.

    Returns:
        Variable: The tensor variable storing the deformable psroi pooling \
            result.

Examples:
.. code-block:: python
input = fluid.layers.data(name="input",
shape=[2, 192, 64, 64],
dtype='float32',
append_batch_size=False)
rois = fluid.layers.data(name="rois",
shape=[4],
dtype='float32',
lod_level=1)
trans = fluid.layers.data(name="trans",
shape=[2, 384, 64, 64],
dtype='float32',
append_batch_size=False)
x = fluid.layers.nn.deformable_roi_pooling(input=input,
rois=rois,
trans=trans,
no_trans=False,
spatial_scale=1.0,
group_size=(1, 1),
pooled_height=8,
pooled_width=8,
part_size=(8, 8),
sample_per_part=4,
trans_std=0.1,
position_sensitive=False)
"""
    input_channels = input.shape[1]
    if not position_sensitive:
        output_channels = input_channels
    else:
        # position-sensitive mode requires
        # input_channels == output_channels * pooled_height * pooled_width
        output_channels = input_channels // (pooled_height * pooled_width)
if part_size is None:
part_height = pooled_height
part_width = pooled_width
part_size = [part_height, part_width]
part_size = utils.convert_to_list(part_size, 2, 'part_size')
group_size = utils.convert_to_list(group_size, 2, 'group_size')
helper = LayerHelper('deformable_psroi_pooling', **locals())
dtype = helper.input_dtype()
output = helper.create_variable_for_type_inference(dtype)
top_count = helper.create_variable_for_type_inference(dtype='int32')
helper.append_op(
type="deformable_psroi_pooling",
inputs={"Input": input,
"ROIs": rois,
"Trans": trans},
outputs={"Output": output,
"TopCount": top_count},
attrs={
"no_trans": no_trans,
"spatial_scale": spatial_scale,
"output_dim": output_channels,
"group_size": group_size,
"pooled_height": pooled_height,
"pooled_width": pooled_width,
"part_size": part_size,
"sample_per_part": sample_per_part,
"trans_std": trans_std
})
return output
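# For reference, a minimal pure-Python sketch (not part of this PR; the
# function and argument names are illustrative only) of the bin and offset
# arithmetic implemented by the CPU/CUDA kernels above, for a single ROI
# and a single bin:
def _bin_sample_locations(roi, ph, pw, pooled_height, pooled_width,
                          trans_x, trans_y, sample_per_part,
                          spatial_scale=1.0):
    """Return the (w, h) sampling points of bin (ph, pw) of `roi`."""
    x1, y1, x2, y2 = roi
    roi_start_w = round(x1) * spatial_scale - 0.5
    roi_start_h = round(y1) * spatial_scale - 0.5
    roi_end_w = (round(x2) + 1.0) * spatial_scale - 0.5
    roi_end_h = (round(y2) + 1.0) * spatial_scale - 0.5
    roi_width = max(roi_end_w - roi_start_w, 0.1)  # avoid zero-sized ROIs
    roi_height = max(roi_end_h - roi_start_h, 0.1)
    bin_size_w = roi_width / pooled_width
    bin_size_h = roi_height / pooled_height
    sub_w = bin_size_w / sample_per_part
    sub_h = bin_size_h / sample_per_part
    # learned offsets are scaled by trans_std upstream and are relative to
    # the ROI size, exactly as in the kernels
    wstart = pw * bin_size_w + roi_start_w + trans_x * roi_width
    hstart = ph * bin_size_h + roi_start_h + trans_y * roi_height
    return [(wstart + iw * sub_w, hstart + ih * sub_h)
            for ih in range(sample_per_part)
            for iw in range(sample_per_part)]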
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
def set_input(input, rois, trans):
inputs = {'Input': input, "ROIs": rois, "Trans": trans}
return inputs
def set_attrs(no_trans, spatial_scale, output_channels, group_size,
pooled_height, pooled_width, part_size, sample_per_part,
trans_std):
attrs = {
'no_trans': no_trans,
'spatial_scale': spatial_scale,
'output_dim': output_channels,
'group_size': group_size,
'pooled_height': pooled_height,
'pooled_width': pooled_width,
'part_size': part_size,
'sample_per_part': sample_per_part,
'trans_std': trans_std
}
return attrs
def set_outputs(output, top_count):
outputs = {
'Output': output.astype('float32'),
'TopCount': top_count.astype('float32')
}
return outputs
class TestDeformablePSROIPoolOp(OpTest):
def set_data(self):
self.start_test1()
self.start_test2()
self.start_test3()
self.start_test4()
def start_test1(self):
self.init_test_case1()
self.make_rois()
self.calc_deformable_psroi_pooling()
inputs = self.input
rois = (self.rois[:, 1:5], self.rois_lod)
trans = self.trans
self.inputs = set_input(inputs, rois, trans)
no_trans = self.no_trans
spatial_scale = self.spatial_scale
output_channels = self.output_channels
group_size = self.group_size
pooled_height = self.pooled_height
pooled_width = self.pooled_width
part_size = self.part_size
sample_per_part = self.sample_per_part
trans_std = self.trans_std
self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
group_size, pooled_height, pooled_width,
part_size, sample_per_part, trans_std)
output = self.out.astype('float32')
top_count = self.top_count.astype('float32')
self.outputs = set_outputs(output, top_count)
def start_test2(self):
self.init_test_case2()
self.make_rois()
self.calc_deformable_psroi_pooling()
inputs = self.input
rois = (self.rois[:, 1:5], self.rois_lod)
trans = self.trans
self.inputs = set_input(inputs, rois, trans)
no_trans = self.no_trans
spatial_scale = self.spatial_scale
output_channels = self.output_channels
group_size = self.group_size
pooled_height = self.pooled_height
pooled_width = self.pooled_width
part_size = self.part_size
sample_per_part = self.sample_per_part
trans_std = self.trans_std
self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
group_size, pooled_height, pooled_width,
part_size, sample_per_part, trans_std)
output = self.out.astype('float32')
top_count = self.top_count.astype('float32')
self.outputs = set_outputs(output, top_count)
def start_test3(self):
self.init_test_case3()
self.make_rois()
self.calc_deformable_psroi_pooling()
inputs = self.input
rois = (self.rois[:, 1:5], self.rois_lod)
trans = self.trans
self.inputs = set_input(inputs, rois, trans)
no_trans = self.no_trans
spatial_scale = self.spatial_scale
output_channels = self.output_channels
group_size = self.group_size
pooled_height = self.pooled_height
pooled_width = self.pooled_width
part_size = self.part_size
sample_per_part = self.sample_per_part
trans_std = self.trans_std
self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
group_size, pooled_height, pooled_width,
part_size, sample_per_part, trans_std)
output = self.out.astype('float32')
top_count = self.top_count.astype('float32')
        self.outputs = set_outputs(output, top_count)

def start_test4(self):
self.init_test_case4()
self.make_rois()
self.calc_deformable_psroi_pooling()
inputs = self.input
rois = (self.rois[:, 1:5], self.rois_lod)
trans = self.trans
self.inputs = set_input(inputs, rois, trans)
no_trans = self.no_trans
spatial_scale = self.spatial_scale
output_channels = self.output_channels
group_size = self.group_size
pooled_height = self.pooled_height
pooled_width = self.pooled_width
part_size = self.part_size
sample_per_part = self.sample_per_part
trans_std = self.trans_std
self.attrs = set_attrs(no_trans, spatial_scale, output_channels,
group_size, pooled_height, pooled_width,
part_size, sample_per_part, trans_std)
output = self.out.astype('float32')
top_count = self.top_count.astype('float32')
        self.outputs = set_outputs(output, top_count)

def init_test_case1(self):
self.batch_size = 3
self.channels = 3 * 2 * 2
self.height = 12
self.width = 12
self.input_dim = [
self.batch_size, self.channels, self.height, self.width
]
self.no_trans = False
self.spatial_scale = 1.0 / 4.0
self.output_channels = 12
self.group_size = [1, 1]
self.pooled_height = 4
self.pooled_width = 4
self.part_size = [4, 4]
self.sample_per_part = 2
self.trans_std = 0.1
        self.input = np.random.random(self.input_dim).astype('float32')

def init_test_case2(self):
self.batch_size = 2
self.channels = 3 * 2 * 2
self.height = 12
self.width = 12
self.input_dim = [
self.batch_size, self.channels, self.height, self.width
]
self.no_trans = True
self.spatial_scale = 1.0 / 2.0
self.output_channels = 12
self.group_size = [1, 1]
self.pooled_height = 7
self.pooled_width = 7
self.part_size = [7, 7]
self.sample_per_part = 4
self.trans_std = 0.1
        self.input = np.random.random(self.input_dim).astype('float32')

def init_test_case3(self):
self.batch_size = 2
self.channels = 3 * 2 * 2
self.height = 12
self.width = 12
self.input_dim = [
self.batch_size, self.channels, self.height, self.width
]
self.no_trans = False
self.spatial_scale = 1.0 / 4.0
self.output_channels = 12
self.group_size = [1, 1]
self.pooled_height = 3
self.pooled_width = 3
self.part_size = [3, 3]
self.sample_per_part = 3
self.trans_std = 0.2
        self.input = np.random.random(self.input_dim).astype('float32')

def init_test_case4(self):
self.batch_size = 2
self.channels = 3 * 2 * 2
self.height = 12
self.width = 12
self.input_dim = [
self.batch_size, self.channels, self.height, self.width
]
self.no_trans = True
self.spatial_scale = 1.0 / 2.0
self.output_channels = 12
self.group_size = [1, 1]
self.pooled_height = 6
self.pooled_width = 2
self.part_size = [6, 6]
self.sample_per_part = 6
self.trans_std = 0.4
self.input = np.random.random(self.input_dim).astype('float32')
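
    # Each batch image `bno` contributes `bno + 1` ROIs, recorded in the LoD
    # so the op can map every ROI back to its image; boxes are drawn in the
    # un-scaled input space and are at least pooled_width x pooled_height.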
def make_rois(self):
rois = []
self.rois_lod = [[]]
for bno in range(self.batch_size):
self.rois_lod[0].append(bno + 1)
for i in range(bno + 1):
                # np.random.random_integers is deprecated; np.random.randint
                # excludes its upper bound, so add 1 to keep ranges inclusive.
                x_1 = np.random.randint(
                    0, self.width // self.spatial_scale - self.pooled_width + 1)
                y_1 = np.random.randint(
                    0, self.height // self.spatial_scale - self.pooled_height + 1)
                x_2 = np.random.randint(
                    x_1 + self.pooled_width, self.width // self.spatial_scale + 1)
                y_2 = np.random.randint(
                    y_1 + self.pooled_height, self.height // self.spatial_scale + 1)
roi = [bno, x_1, y_1, x_2, y_2]
rois.append(roi)
self.rois_num = len(rois)
self.rois = np.array(rois).astype("float32")
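
    # Bilinear interpolation at fractional (p_h, p_w):
    #   val = (1-l_h)(1-l_w)*v_1 + (1-l_h)*l_w*v_2 + l_h*(1-l_w)*v_3 + l_h*l_w*v_4
    # with l_h, l_w the fractional parts; out-of-range neighbours count as 0.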
def dmc_bilinear(self, data_im, p_h, p_w):
h_low = int(np.floor(p_h))
w_low = int(np.floor(p_w))
h_high = h_low + 1
w_high = w_low + 1
l_h = p_h - h_low
l_w = p_w - w_low
h_h = 1 - l_h
h_w = 1 - l_w
v_1 = 0
if h_low >= 0 and w_low >= 0:
v_1 = data_im[h_low, w_low]
v_2 = 0
if h_low >= 0 and w_high <= self.width - 1:
v_2 = data_im[h_low, w_high]
v_3 = 0
if h_high <= self.height - 1 and w_low >= 0:
v_3 = data_im[h_high, w_low]
v_4 = 0
if h_high <= self.height - 1 and w_high <= self.width - 1:
v_4 = data_im[h_high, w_high]
w_1, w_2, w_3, w_4 = h_h * h_w, h_h * l_w, l_h * h_w, l_h * l_w
val = w_1 * v_1 + w_2 * v_2 + w_3 * v_3 + w_4 * v_4
return val
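
    # Reference (NumPy) implementation: every output element averages up to
    # sample_per_part^2 bilinear samples from a bin whose position is shifted
    # by the learned offsets in Trans; TopCount records the number of valid
    # samples per bin, which the backward pass needs.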
def calc_deformable_psroi_pooling(self):
output_shape = (self.rois_num, self.output_channels, self.pooled_height,
self.pooled_width)
self.out = np.zeros(output_shape)
self.trans = np.random.rand(self.rois_num, 2, self.part_size[0],
self.part_size[1]).astype('float32')
        self.top_count = np.random.random(output_shape).astype('float32')
count = self.rois_num * self.output_channels * self.pooled_height * self.pooled_width
for index in range(count):
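            # Decompose the flat index into (n_out, ctop, p_h, p_w), mirroring
            # the kernel's one-thread-per-output-element indexing.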
            p_w = index % self.pooled_width
            p_h = index // self.pooled_width % self.pooled_height
            ctop = (index // (self.pooled_width * self.pooled_height) %
                    self.output_channels)
            n_out = index // (self.pooled_width * self.pooled_height *
                              self.output_channels)
roi = self.rois[n_out]
roi_batch_id = int(roi[0])
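            # Map the ROI from input to feature-map coordinates; the -0.5
            # shift moves box corners onto a pixel-center convention.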
roi_start_w = int(np.round(roi[1])) * self.spatial_scale - 0.5
roi_start_h = int(np.round(roi[2])) * self.spatial_scale - 0.5
roi_end_w = int(np.round(roi[3] + 1)) * self.spatial_scale - 0.5
roi_end_h = int(np.round(roi[4] + 1)) * self.spatial_scale - 0.5
roi_width = max(roi_end_w - roi_start_w, 0.1)
roi_height = max(roi_end_h - roi_start_h, 0.1)
bin_size_h = float(roi_height) / float(self.pooled_height)
bin_size_w = float(roi_width) / float(self.pooled_width)
sub_bin_size_h = bin_size_h / self.sample_per_part
sub_bin_size_w = bin_size_w / self.sample_per_part
            # Floor the full expression; operands are non-negative, so int()
            # truncation is equivalent to floor().
            part_h = int(np.floor(float(p_h) / self.pooled_height *
                                  self.part_size[0]))
            part_w = int(np.floor(float(p_w) / self.pooled_width *
                                  self.part_size[1]))
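            # The learned offsets in Trans (scaled by trans_std) translate the
            # whole bin; with no_trans=True this reduces to plain PS-ROI
            # pooling.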
if self.no_trans:
trans_x = 0
trans_y = 0
else:
trans_x = self.trans[n_out][0][part_h][part_w] * self.trans_std
trans_y = self.trans[n_out][1][part_h][part_w] * self.trans_std
wstart = p_w * bin_size_w + roi_start_w
wstart = wstart + trans_x * roi_width
hstart = p_h * bin_size_h + roi_start_h
hstart = hstart + trans_y * roi_height
            sum_val = 0  # renamed from `sum` to avoid shadowing the built-in
            num_sample = 0
            # group_size is [height, width]: g_h pairs with group_size[0] and
            # pooled_height, g_w with group_size[1] and pooled_width (the
            # original swapped the axes, which was harmless only because every
            # case uses group_size = [1, 1]).
            g_w = np.floor(p_w * self.group_size[1] / self.pooled_width)
            g_h = np.floor(p_h * self.group_size[0] / self.pooled_height)
            g_w = min(max(g_w, 0), self.group_size[1] - 1)
            g_h = min(max(g_h, 0), self.group_size[0] - 1)
input_i = self.input[roi_batch_id]
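            # Average a sample_per_part x sample_per_part grid of bilinear
            # samples; samples outside the feature map are skipped entirely.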
for i_w in range(self.sample_per_part):
for i_h in range(self.sample_per_part):
w_sample = wstart + i_w * sub_bin_size_w
h_sample = hstart + i_h * sub_bin_size_h
if w_sample < -0.5 or w_sample > self.width - 0.5 or \
h_sample < -0.5 or h_sample > self.height - 0.5:
continue
w_sample = min(max(w_sample, 0.), self.width - 1.)
h_sample = min(max(h_sample, 0.), self.height - 1.)
c_sample = int((ctop * self.group_size[0] + g_h) *
self.group_size[1] + g_w)
val = self.dmc_bilinear(input_i[c_sample], h_sample,
w_sample)
                    sum_val += val
                    num_sample += 1
if num_sample == 0:
self.out[n_out][ctop][p_h][p_w] = 0
else:
                self.out[n_out][ctop][p_h][p_w] = sum_val / num_sample
self.top_count[n_out][ctop][p_h][p_w] = num_sample

    def setUp(self):
        self.op_type = "deformable_psroi_pooling"
        self.set_data()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['Input'], 'Output')


if __name__ == '__main__':
    unittest.main()
@@ -1995,6 +1995,35 @@ class TestBook(LayerTest):
out = layers.unfold(x, [3, 3], 1, 1, 1)
return (out)
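
    # Smoke test for the new layer: checks that deformable_roi_pooling builds
    # without error; rois use lod_level=1 (a variable-length box list per
    # image) and part_size matches the pooled output shape.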
def test_deform_roi_pooling(self):
with program_guard(fluid.default_main_program(),
fluid.default_startup_program()):
input = layers.data(
name='input',
shape=[2, 3, 32, 32],
dtype='float32',
append_batch_size=False)
rois = layers.data(
name="rois", shape=[4], dtype='float32', lod_level=1)
trans = layers.data(
name="trans",
shape=[2, 3, 32, 32],
dtype='float32',
append_batch_size=False)
out = layers.deformable_roi_pooling(
input=input,
rois=rois,
trans=trans,
no_trans=False,
spatial_scale=1.0,
group_size=(1, 1),
pooled_height=8,
pooled_width=8,
part_size=(8, 8),
sample_per_part=4,
trans_std=0.1)
            return (out)


if __name__ == '__main__':
unittest.main()