From 0f4c7332969bdb057f855cd4a37174f3c06de281 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 20 Jul 2017 12:03:23 +0800 Subject: [PATCH] add ROIPooling for Fast(er) R-CNN --- paddle/gserver/layers/ROIPoolLayer.cpp | 154 ++++++++++++++++++ paddle/gserver/layers/ROIPoolLayer.h | 53 ++++++ paddle/gserver/tests/test_LayerGrad.cpp | 34 ++++ proto/ModelConfig.proto | 9 + python/paddle/trainer/config_parser.py | 11 ++ .../paddle/trainer_config_helpers/layers.py | 37 +++++ 6 files changed, 298 insertions(+) create mode 100644 paddle/gserver/layers/ROIPoolLayer.cpp create mode 100644 paddle/gserver/layers/ROIPoolLayer.h diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp new file mode 100644 index 00000000000..04763fd152f --- /dev/null +++ b/paddle/gserver/layers/ROIPoolLayer.cpp @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ROIPoolLayer.h" + +namespace paddle { + +REGISTER_LAYER(roi_pool, ROIPoolLayer); + +bool ROIPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf(); + pooledWidth_ = layerConf.pooled_width(); + pooledHeight_ = layerConf.pooled_height(); + spatialScale_ = layerConf.spatial_scale(); + + return true; +} + +void ROIPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf(); + height_ = getInput(0).getFrameHeight(); + if (!height_) height_ = layerConf.height(); + width_ = getInput(0).getFrameWidth(); + if (!width_) width_ = layerConf.width(); + channels_ = getInputValue(0)->getWidth() / width_ / height_; + + size_t batchSize = getInput(0).getBatchSize(); + size_t numROIs = getInput(1).getBatchSize(); + + real* bottomData = getInputValue(0)->getData(); + size_t batchOffset = getInputValue(0)->getWidth(); + size_t channelOffset = height_ * width_; + real* bottomROIs = getInputValue(1)->getData(); + size_t roiOffset = getInputValue(1)->getWidth(); + size_t poolChannelOffset = pooledHeight_ * pooledWidth_; + + resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_); + real* outputData = getOutputValue()->getData(); + Matrix::resizeOrCreate(maxIdxs_, + numROIs, + channels_ * pooledHeight_ * pooledWidth_, + false, + false); + real* argmaxData = maxIdxs_->getData(); + + size_t uZero = 0; + size_t uOne = 1; + + for (size_t n = 0; n < numROIs; ++n) { + size_t roiBatchIdx = bottomROIs[0]; + size_t roiStartW = std::round(bottomROIs[1] * spatialScale_); + size_t roiStartH = std::round(bottomROIs[2] * spatialScale_); + size_t roiEndW = std::round(bottomROIs[3] * spatialScale_); + size_t roiEndH = std::round(bottomROIs[4] * spatialScale_); + CHECK_GE(roiBatchIdx, 0); + CHECK_LT(roiBatchIdx, batchSize); + size_t roiHeight = std::max(roiEndH - roiStartH + 1, uOne); + size_t roiWidth = std::max(roiEndW - roiStartW + 1, uOne); + real binSizeH = + static_cast(roiHeight) / static_cast(pooledHeight_); + real binSizeW = + static_cast(roiWidth) / static_cast(pooledWidth_); + real* batchData = bottomData + batchOffset * roiBatchIdx; + for (size_t c = 0; c < channels_; ++c) { + for (size_t ph = 0; ph < pooledHeight_; ++ph) { + for (size_t pw = 0; pw < pooledWidth_; ++pw) { + size_t hstart = static_cast(std::floor(ph * binSizeH)); + size_t wstart = static_cast(std::floor(pw * binSizeW)); + size_t hend = static_cast(std::ceil((ph + 1) * binSizeH)); + size_t wend = static_cast(std::ceil((pw + 1) * binSizeW)); + hstart = std::min(std::max(hstart + roiStartH, uZero), height_); + wstart = std::min(std::max(wstart + roiStartW, uZero), width_); + hend = std::min(std::max(hend + roiStartH, uZero), height_); + wend = std::min(std::max(wend + roiStartW, uZero), width_); + + bool isEmpty = (hend <= hstart) || (wend <= wstart); + size_t poolIndex = ph * pooledWidth_ + pw; + if (isEmpty) { + outputData[poolIndex] = 0; + argmaxData[poolIndex] = -1; + } + + for (size_t h = hstart; h < hend; ++h) { + for (size_t w = wstart; w < wend; ++w) { + size_t index = h * width_ + w; + if (batchData[index] > outputData[poolIndex]) { + outputData[poolIndex] = batchData[index]; + argmaxData[poolIndex] = index; + } + } + } + } + } + batchData += channelOffset; + outputData += poolChannelOffset; + argmaxData += poolChannelOffset; + } + bottomROIs += roiOffset; + } +} + +void ROIPoolLayer::backward(const UpdateCallback& callback) { + real* bottomROIs = getInputValue(1)->getData(); + size_t numROIs = getInput(1).getBatchSize(); + size_t roiOffset = getInputValue(1)->getWidth(); + + MatrixPtr inGrad = getInputGrad(0); + real* inDiffData = inGrad->getData(); + size_t batchOffset = getInputValue(0)->getWidth(); + size_t channelOffset = height_ * width_; + + MatrixPtr outGrad = getOutputGrad(); + real* outDiffData = outGrad->getData(); + size_t poolChannelOffset = pooledHeight_ * pooledWidth_; + real* argmaxData = maxIdxs_->getData(); + + for (size_t n = 0; n < numROIs; ++n) { + size_t roiBatchIdx = bottomROIs[0]; + real* batchDiffData = inDiffData + batchOffset * roiBatchIdx; + for (size_t c = 0; c < channels_; ++c) { + for (size_t ph = 0; ph < pooledHeight_; ++ph) { + for (size_t pw = 0; pw < pooledWidth_; ++pw) { + size_t poolIndex = ph * pooledWidth_ + pw; + if (argmaxData[poolIndex] > 0) { + size_t index = static_cast(argmaxData[poolIndex]); + batchDiffData[index] += outDiffData[poolIndex]; + } + } + } + batchDiffData += channelOffset; + outDiffData += poolChannelOffset; + argmaxData += poolChannelOffset; + } + bottomROIs += roiOffset; + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h new file mode 100644 index 00000000000..ca412d2845f --- /dev/null +++ b/paddle/gserver/layers/ROIPoolLayer.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * A layer used by Fast R-CNN to extract feature maps of ROIs from the last + * feature map. + * - Input: This layer needs two input layers: The first input layer is a + * convolution layer; The second input layer contains the ROI data which is the + * output of ProposalLayer in Faster R-CNN. layers for generating bbox + * location offset and the classification confidence. - Output: The + * ROIs' feature map. Reference: Shaoqing Ren, Kaiming He, Ross Girshick, and + * Jian Sun. Faster R-CNN: Towards Real-Time Object Detection with Region + * Proposal + */ + +class ROIPoolLayer : public Layer { +protected: + size_t channels_; + size_t width_; + size_t height_; + size_t pooledWidth_; + size_t pooledHeight_; + real spatialScale_; + + MatrixPtr maxIdxs_; + +public: + explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 9af083468c0..77feb6d4c9e 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1830,6 +1830,40 @@ TEST(Layer, CropLayer) { } } +TEST(Layer, roi_pool) { + TestConfig config; + config.layerConfig.set_type("roi_pool"); + config.biasSize = 0; + LayerInputConfig* input = config.layerConfig.add_inputs(); + ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf(); + roiPoolConf->set_pooled_width(7); + roiPoolConf->set_pooled_height(7); + roiPoolConf->set_spatial_scale(1. / 16); + roiPoolConf->set_width(14); + roiPoolConf->set_height(14); + + MatrixPtr roiValue = Matrix::create(10, 10, false, false); + roiValue->zeroMem(); + real* roiData = roiValue->getData(); + for (size_t i = 0; i < roiValue->getElementCnt() / 5; ++i) { + *roiData++ = std::rand() % 2; + *roiData++ = std::rand() % 224; + *roiData++ = std::rand() % 224; + size_t xMin = static_cast(*(roiData - 2)); + size_t yMin = static_cast(*(roiData - 1)); + *roiData++ = xMin + std::rand() % (224 - xMin); + *roiData++ = yMin + std::rand() % (224 - yMin); + } + + config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}}); + config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "roi_pool", 5, false, useGpu, false); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 83f72c137bd..275723272b3 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -289,6 +289,14 @@ message DetectionOutputConfig { optional uint32 width = 9 [default = 1]; } +message ROIPoolConfig { + required uint32 pooled_width = 1; + required uint32 pooled_height = 2; + required float spatial_scale = 3; + optional uint32 height = 4 [default = 1]; + optional uint32 width = 5 [default = 1]; +} + message LayerInputConfig { required string input_layer_name = 1; optional string input_parameter_name = 2; @@ -309,6 +317,7 @@ message LayerInputConfig { optional RowConvConfig row_conv_conf = 15; optional MultiBoxLossConfig multibox_loss_conf = 16; optional DetectionOutputConfig detection_output_conf = 17; + optional ROIPoolConfig roi_pool_conf = 18; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index ab81e67579e..bfb9dd7f1d4 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1732,6 +1732,17 @@ class DetectionOutputLayer(LayerBase): self.config.size = size +@config_layer('roi_pool') +class ROIPoolLayer(LayerBase): + def __init__(self, name, inputs, pooled_width, pooled_height, + spatial_scale): + super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs) + config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs') + self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width + self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height + self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale + + @config_layer('data') class DataLayer(LayerBase): def __init__(self, name, size, height=None, width=None, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index fdb6f83f2ba..c1bdeb68088 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -117,6 +117,7 @@ __all__ = [ 'cross_channel_norm_layer', 'multibox_loss_layer', 'detection_output_layer', + 'roi_pool_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -201,6 +202,7 @@ class LayerType(object): PRIORBOX_LAYER = 'priorbox' MULTIBOX_LOSS_LAYER = 'multibox_loss' DETECTION_OUTPUT_LAYER = 'detection_output' + ROI_POOL_LAYER = 'roi_pool' CTC_LAYER = 'ctc' WARP_CTC_LAYER = 'warp_ctc' @@ -1200,6 +1202,41 @@ def detection_output_layer(input_loc, name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size) +@wrap_name_default("roi_pool") +def roi_pool_layer(input, + rois, + pooled_width, + pooled_height, + spatial_scale, + name=None): + """ + A layer used by Fast R-CNN to extract feature maps of ROIs from the last + feature map. + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput. + :param rois: The input ROIs' data. + :type rois: LayerOutput. + :param pooled_width: The width after pooling. + :type pooled_width: int + :param pooled_height: The height after pooling. + :type pooled_height: int + :param spatial_scale: The spatial scale between the image and feature map. + :type spatial_scale: float + :return: LayerOutput + """ + Layer( + name=name, + type=LayerType.ROI_POOL_LAYER, + inputs=[input.name, rois.name], + pooled_width=pooled_width, + pooled_height=pooled_height, + spatial_scale=spatial_scale) + return LayerOutput(name, LayerType.ROI_POOL_LAYER, parents=[input, rois]) + + @wrap_name_default("cross_channel_norm") def cross_channel_norm_layer(input, name=None, param_attr=None): """ -- GitLab