From 7960928883feb29dbc51b9a01fde45822d6f9468 Mon Sep 17 00:00:00 2001 From: wanghaox Date: Wed, 22 Nov 2017 16:37:08 +0800 Subject: [PATCH] add roi pool operator --- paddle/operators/roi_pool_op.cc | 126 +++++++++++++++ paddle/operators/roi_pool_op.cu | 265 ++++++++++++++++++++++++++++++++ paddle/operators/roi_pool_op.h | 213 +++++++++++++++++++++++++ 3 files changed, 604 insertions(+) create mode 100755 paddle/operators/roi_pool_op.cc create mode 100755 paddle/operators/roi_pool_op.cu create mode 100755 paddle/operators/roi_pool_op.h diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc new file mode 100755 index 00000000000..902c351af16 --- /dev/null +++ b/paddle/operators/roi_pool_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +class RoiPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RoiPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Rois"), + "Input(Rois) of RoiPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RoiPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Argmax"), + "Output(Argmax) of RoiPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + + // Initialize the output's dims to maximum, + // and re-set to real dims by the value of Rois at kernel + ctx->SetOutputDim("Out", input_dims); + } + + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class RoiPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + +class RoiPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RoiPoolOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor), " + "the input of RoiPoolOp."); + AddInput("Rois", + "(Tensor), " + "RoIs (Regions of Interest) to pool over. " + "Should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); + AddOutput("Out", + "(Tensor), " + "RoI pooled output 4-D tensor of shape " + "(num_rois, channels, pooled_h, pooled_w)."); + AddOutput("Argmax", + "(Tensor), " + "Argmaxes corresponding to indices in X used " + "for gradient computation. Only output " + "if arg “is_test” is false.").AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "The pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "The pooled output width.") + .SetDefault(1); + AddComment(R"DOC( +RoiPool operator + +ROI Pooling for Faster-RCNN. The link below is a further introduction: +https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(roi_pool, ops::RoiPoolOp, ops::RoiPoolOpMaker, + roi_pool_grad, ops::RoiPoolGradOp); +REGISTER_OP_CPU_KERNEL( + roi_pool, + ops::CPURoiPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + roi_pool_grad, + ops::CPURoiPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu new file mode 100755 index 00000000000..62c05307ca4 --- /dev/null +++ b/paddle/operators/roi_pool_op.cu @@ -0,0 +1,265 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/cuda_helper.h" +#include "paddle/operators/roi_pool_op.h" + +namespace paddle { +namespace operators { + +#define FLT_MAX __FLT_MAX__ + +constexpr int PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS = 512; +constexpr int PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS = 4096; + +inline int PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(const int N) { + return std::min((N + PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS - 1) + / PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS, + PADDLE_OPERATORS_ROIPOOL_MAXIMUM_NUM_BLOCKS); +} + +template +__global__ void GPURoiPoolForward( + const int nthreads, + const T* input_data, + const int64_t* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * 5; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 0 : -FLT_MAX; + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[index] = maxval; + if (argmax_data) { + argmax_data[index] = maxidx; + } + } + } + +template +__global__ void GPURoiPoolBackward( + const int nthreads, + const int64_t* input_rois, + const T* output_grad, + const int64_t* argmax_data, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * 5; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd(offset_input_grad + argmax, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } + } + + +template +class GPURoiPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("Rois"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0, + "The spatial scale must greater than 0"); + + auto in_dims = in->dims(); + auto in_stride = framework::stride(in_dims); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = rois->dims()[0]; + auto out_dims = in_dims; + out_dims[0] = rois_num; + out_dims[1] = in_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->Resize(out_dims); + out->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.device_context(), out, static_cast(0)); + argmax->Resize(out->dims()); + argmax->mutable_data(ctx.GetPlace()); + math::SetConstant set_init; + set_init(ctx.device_context(), argmax, static_cast(-1)); + + if (rois_num== 0) return; + + int output_size = out->numel(); + int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_size); + int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS; + + GPURoiPoolForward + <<>>( + output_size, + in->data(), + rois->data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); + + return; + } +}; + +template +class GPURoiPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("Rois"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (x_grad) { + x_grad->Resize(in->dims()); + x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.device_context(), x_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + int blocks = PADDLE_OPERATORS_ROIPOOL_GET_BLOCKS(output_grad_size); + int threads = PADDLE_OPERATORS_ROIPOOL_CUDA_NUM_THREADS; + + if (output_grad_size > 0) { + GPURoiPoolBackward + <<>>( + output_grad_size, + rois->data(), + out_grad->data(), + argmax->data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } + return; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + roi_pool, + ops::GPURoiPoolOpKernel); +REGISTER_OP_GPU_KERNEL( + roi_pool_grad, + ops::GPURoiPoolGradOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h new file mode 100755 index 00000000000..694677009fb --- /dev/null +++ b/paddle/operators/roi_pool_op.h @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +class CPURoiPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("Rois"); + auto* out = ctx.Output("Out"); + auto* argmax = ctx.Output("Argmax"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0, + "The spatial scale must greater than 0"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto out_dims = in_dims; + out_dims[0] = rois_num; + out_dims[1] = channels; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + out->Resize(out_dims); + argmax->Resize(out->dims()); + + auto in_stride = framework::stride(in_dims); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_dims); + + const T* input_data = in->data(); + const int64_t* rois_data = rois->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); + + math::SetConstant set_zero; + set_zero(ctx.device_context(), out, static_cast(0)); + math::SetConstant set_init; + set_init(ctx.device_context(), argmax, static_cast(-1)); + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + PADDLE_ENFORCE_GE(roi_batch_id, 0); + PADDLE_ENFORCE_LT(roi_batch_id, batch_size); + rois_data += roi_stride[0]; + } + + rois_data = rois->data(); + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = rois_data[0]; + int roi_start_w = round(rois_data[1] * spatial_scale); + int roi_start_h = round(rois_data[2] * spatial_scale); + int roi_end_w = round(rois_data[3] * spatial_scale); + int roi_end_h = round(rois_data[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + + const float bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); + const float bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const float* batch_data = input_data + roi_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = is_empty ? 0 : -__FLT_MAX__; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + argmax_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + argmax_data += argmax_stride[1]; + } + // Increment ROI data pointer + rois_data += roi_stride[0]; + } + return; + } +}; + +template +class CPURoiPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("Rois"); + auto* argmax = ctx.Input("Argmax"); + + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + + if (x_grad) { + int channels = in->dims()[1]; + auto in_stride = framework::stride(in->dims()); + auto roi_stride = framework::stride(rois->dims()); + + const int64_t* rois_data = rois->data(); + int rois_num = rois->dims()[0]; + + T* x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.device_context(), x_grad, static_cast(0)); + + size_t roi_offset = roi_stride[0]; + size_t batch_offset = in_stride[0]; + size_t channel_offset = in_stride[1]; + + const T* out_grad_data = out_grad->data(); + size_t pool_channel_offset = pooled_height * pooled_width; + const int64_t* argmax_data = argmax->data(); + + for (size_t n = 0; n < rois_num; ++n) { + size_t roi_batch_idx = rois_data[0]; + T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx; + for (size_t c = 0; c < channels; ++c) { + for (size_t ph = 0; ph < pooled_height; ++ph) { + for (size_t pw = 0; pw < pooled_width; ++pw) { + size_t pool_index = ph * pooled_width + pw; + + if (argmax_data[pool_index] >= 0) { + size_t index = static_cast(argmax_data[pool_index]); + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += channel_offset; + out_grad_data += pool_channel_offset; + argmax_data += pool_channel_offset; + } + rois_data += roi_offset; + } + } + } +}; + +} // namespace operators +} // namespace paddle -- GitLab