提交 0f4c7332 编写于 作者: G guosheng

add ROIPooling for Fast(er) R-CNN

上级 a98346f4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ROIPoolLayer.h"
namespace paddle {
REGISTER_LAYER(roi_pool, ROIPoolLayer);
bool ROIPoolLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
pooledWidth_ = layerConf.pooled_width();
pooledHeight_ = layerConf.pooled_height();
spatialScale_ = layerConf.spatial_scale();
return true;
}
void ROIPoolLayer::forward(PassType passType) {
Layer::forward(passType);
const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
height_ = getInput(0).getFrameHeight();
if (!height_) height_ = layerConf.height();
width_ = getInput(0).getFrameWidth();
if (!width_) width_ = layerConf.width();
channels_ = getInputValue(0)->getWidth() / width_ / height_;
size_t batchSize = getInput(0).getBatchSize();
size_t numROIs = getInput(1).getBatchSize();
real* bottomData = getInputValue(0)->getData();
size_t batchOffset = getInputValue(0)->getWidth();
size_t channelOffset = height_ * width_;
real* bottomROIs = getInputValue(1)->getData();
size_t roiOffset = getInputValue(1)->getWidth();
size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
real* outputData = getOutputValue()->getData();
Matrix::resizeOrCreate(maxIdxs_,
numROIs,
channels_ * pooledHeight_ * pooledWidth_,
false,
false);
real* argmaxData = maxIdxs_->getData();
size_t uZero = 0;
size_t uOne = 1;
for (size_t n = 0; n < numROIs; ++n) {
size_t roiBatchIdx = bottomROIs[0];
size_t roiStartW = std::round(bottomROIs[1] * spatialScale_);
size_t roiStartH = std::round(bottomROIs[2] * spatialScale_);
size_t roiEndW = std::round(bottomROIs[3] * spatialScale_);
size_t roiEndH = std::round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0);
CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, uOne);
size_t roiWidth = std::max(roiEndW - roiStartW + 1, uOne);
real binSizeH =
static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
real binSizeW =
static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
real* batchData = bottomData + batchOffset * roiBatchIdx;
for (size_t c = 0; c < channels_; ++c) {
for (size_t ph = 0; ph < pooledHeight_; ++ph) {
for (size_t pw = 0; pw < pooledWidth_; ++pw) {
size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
hstart = std::min(std::max(hstart + roiStartH, uZero), height_);
wstart = std::min(std::max(wstart + roiStartW, uZero), width_);
hend = std::min(std::max(hend + roiStartH, uZero), height_);
wend = std::min(std::max(wend + roiStartW, uZero), width_);
bool isEmpty = (hend <= hstart) || (wend <= wstart);
size_t poolIndex = ph * pooledWidth_ + pw;
if (isEmpty) {
outputData[poolIndex] = 0;
argmaxData[poolIndex] = -1;
}
for (size_t h = hstart; h < hend; ++h) {
for (size_t w = wstart; w < wend; ++w) {
size_t index = h * width_ + w;
if (batchData[index] > outputData[poolIndex]) {
outputData[poolIndex] = batchData[index];
argmaxData[poolIndex] = index;
}
}
}
}
}
batchData += channelOffset;
outputData += poolChannelOffset;
argmaxData += poolChannelOffset;
}
bottomROIs += roiOffset;
}
}
void ROIPoolLayer::backward(const UpdateCallback& callback) {
real* bottomROIs = getInputValue(1)->getData();
size_t numROIs = getInput(1).getBatchSize();
size_t roiOffset = getInputValue(1)->getWidth();
MatrixPtr inGrad = getInputGrad(0);
real* inDiffData = inGrad->getData();
size_t batchOffset = getInputValue(0)->getWidth();
size_t channelOffset = height_ * width_;
MatrixPtr outGrad = getOutputGrad();
real* outDiffData = outGrad->getData();
size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
real* argmaxData = maxIdxs_->getData();
for (size_t n = 0; n < numROIs; ++n) {
size_t roiBatchIdx = bottomROIs[0];
real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
for (size_t c = 0; c < channels_; ++c) {
for (size_t ph = 0; ph < pooledHeight_; ++ph) {
for (size_t pw = 0; pw < pooledWidth_; ++pw) {
size_t poolIndex = ph * pooledWidth_ + pw;
if (argmaxData[poolIndex] > 0) {
size_t index = static_cast<size_t>(argmaxData[poolIndex]);
batchDiffData[index] += outDiffData[poolIndex];
}
}
}
batchDiffData += channelOffset;
outDiffData += poolChannelOffset;
argmaxData += poolChannelOffset;
}
bottomROIs += roiOffset;
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
namespace paddle {
/**
* A layer used by Fast R-CNN to extract feature maps of ROIs from the last
* feature map.
* - Input: This layer needs two input layers: The first input layer is a
* convolution layer; The second input layer contains the ROI data which is the
* output of ProposalLayer in Faster R-CNN. layers for generating bbox
* location offset and the classification confidence. - Output: The
* ROIs' feature map. Reference: Shaoqing Ren, Kaiming He, Ross Girshick, and
* Jian Sun. Faster R-CNN: Towards Real-Time Object Detection with Region
* Proposal
*/
class ROIPoolLayer : public Layer {
protected:
size_t channels_;
size_t width_;
size_t height_;
size_t pooledWidth_;
size_t pooledHeight_;
real spatialScale_;
MatrixPtr maxIdxs_;
public:
explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
} // namespace paddle
......@@ -1830,6 +1830,40 @@ TEST(Layer, CropLayer) {
}
}
TEST(Layer, roi_pool) {
TestConfig config;
config.layerConfig.set_type("roi_pool");
config.biasSize = 0;
LayerInputConfig* input = config.layerConfig.add_inputs();
ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
roiPoolConf->set_pooled_width(7);
roiPoolConf->set_pooled_height(7);
roiPoolConf->set_spatial_scale(1. / 16);
roiPoolConf->set_width(14);
roiPoolConf->set_height(14);
MatrixPtr roiValue = Matrix::create(10, 10, false, false);
roiValue->zeroMem();
real* roiData = roiValue->getData();
for (size_t i = 0; i < roiValue->getElementCnt() / 5; ++i) {
*roiData++ = std::rand() % 2;
*roiData++ = std::rand() % 224;
*roiData++ = std::rand() % 224;
size_t xMin = static_cast<size_t>(*(roiData - 2));
size_t yMin = static_cast<size_t>(*(roiData - 1));
*roiData++ = xMin + std::rand() % (224 - xMin);
*roiData++ = yMin + std::rand() % (224 - yMin);
}
config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "roi_pool", 5, false, useGpu, false);
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -289,6 +289,14 @@ message DetectionOutputConfig {
optional uint32 width = 9 [default = 1];
}
message ROIPoolConfig {
required uint32 pooled_width = 1;
required uint32 pooled_height = 2;
required float spatial_scale = 3;
optional uint32 height = 4 [default = 1];
optional uint32 width = 5 [default = 1];
}
message LayerInputConfig {
required string input_layer_name = 1;
optional string input_parameter_name = 2;
......@@ -309,6 +317,7 @@ message LayerInputConfig {
optional RowConvConfig row_conv_conf = 15;
optional MultiBoxLossConfig multibox_loss_conf = 16;
optional DetectionOutputConfig detection_output_conf = 17;
optional ROIPoolConfig roi_pool_conf = 18;
}
message LayerConfig {
......
......@@ -1732,6 +1732,17 @@ class DetectionOutputLayer(LayerBase):
self.config.size = size
@config_layer('roi_pool')
class ROIPoolLayer(LayerBase):
def __init__(self, name, inputs, pooled_width, pooled_height,
spatial_scale):
super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
@config_layer('data')
class DataLayer(LayerBase):
def __init__(self, name, size, height=None, width=None, device=None):
......
......@@ -117,6 +117,7 @@ __all__ = [
'cross_channel_norm_layer',
'multibox_loss_layer',
'detection_output_layer',
'roi_pool_layer',
'spp_layer',
'pad_layer',
'eos_layer',
......@@ -201,6 +202,7 @@ class LayerType(object):
PRIORBOX_LAYER = 'priorbox'
MULTIBOX_LOSS_LAYER = 'multibox_loss'
DETECTION_OUTPUT_LAYER = 'detection_output'
ROI_POOL_LAYER = 'roi_pool'
CTC_LAYER = 'ctc'
WARP_CTC_LAYER = 'warp_ctc'
......@@ -1200,6 +1202,41 @@ def detection_output_layer(input_loc,
name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
@wrap_name_default("roi_pool")
def roi_pool_layer(input,
rois,
pooled_width,
pooled_height,
spatial_scale,
name=None):
"""
A layer used by Fast R-CNN to extract feature maps of ROIs from the last
feature map.
:param name: The Layer Name.
:type name: basestring
:param input: The input layer.
:type input: LayerOutput.
:param rois: The input ROIs' data.
:type rois: LayerOutput.
:param pooled_width: The width after pooling.
:type pooled_width: int
:param pooled_height: The height after pooling.
:type pooled_height: int
:param spatial_scale: The spatial scale between the image and feature map.
:type spatial_scale: float
:return: LayerOutput
"""
Layer(
name=name,
type=LayerType.ROI_POOL_LAYER,
inputs=[input.name, rois.name],
pooled_width=pooled_width,
pooled_height=pooled_height,
spatial_scale=spatial_scale)
return LayerOutput(name, LayerType.ROI_POOL_LAYER, parents=[input, rois])
@wrap_name_default("cross_channel_norm")
def cross_channel_norm_layer(input, name=None, param_attr=None):
"""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册