From 50aa6ba6f54f57599486189f8718d9c40ac24eef Mon Sep 17 00:00:00 2001
From: Yuan Gao
Date: Wed, 11 Jul 2018 12:01:31 +0800
Subject: [PATCH] add rpn target assign op (#11449)

* Add region proposal network (RPN) target assign operator and Python API for Faster-RCNN.
---
 .../fluid/operators/detection/CMakeLists.txt  |   3 +-
 .../detection/rpn_target_assign_op.cc         | 328 ++++++++++++++++++
 python/paddle/fluid/layers/detection.py       | 131 +++++++-
 .../unittests/test_rpn_target_assign_op.py    | 103 +++++++
 4 files changed, 563 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/operators/detection/rpn_target_assign_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py

diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 6d296ff7bf1..a44d84cd7b9 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -27,7 +27,8 @@ anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
 target_assign_op.cu)
 detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
-  polygon_box_transform_op.cu)
+polygon_box_transform_op.cu)
+detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 
 # Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
new file mode 100644
index 00000000000..3b0c9b28865
--- /dev/null
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -0,0 +1,328 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <cmath>
+#include <random>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+// Operator definition for rpn_target_assign: validates that the input and
+// outputs exist and that DistMat is a matrix. No output shape is inferred
+// here; the kernel allocates the outputs once the sample counts are known.
+class RpnTargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
+                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("LocationIndex"),
+        "Output(LocationIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ScoreIndex"),
+        "Output(ScoreIndex) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetLabel"),
+        "Output(TargetLabel) of RpnTargetAssignOp should not be null");
+
+    auto in_dims = ctx->GetInputDim("DistMat");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "The rank of Input(DistMat) must be 2.");
+  }
+};
+
+// CPU kernel. Labels every anchor as foreground (1), background (0) or
+// ignored (-1) from the IoU matrix, then samples the anchors that are used.
+template <typename T>
+class RpnTargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  // Labels the `col` anchors of one image from its [row, col] IoU block and
+  // appends `anchor_offset + j` for every foreground / background anchor j.
+  // `target_label_data` points at the first label of this image.
+  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
+                   const int row, const int col, const float pos_threshold,
+                   const float neg_threshold, int64_t* target_label_data,
+                   std::vector<int>* fg_inds, std::vector<int>* bg_inds,
+                   const int anchor_offset = 0) const {
+    // (i) The best-matching anchor(s) of every gt box are positive.
+    for (int64_t i = 0; i < row; ++i) {
+      const T* v = dist_data + i * col;
+      T max_dist = *std::max_element(v, v + col);
+      for (int64_t j = 0; j < col; ++j) {
+        if (v[j] == max_dist) target_label_data[j] = 1;
+      }
+    }
+
+    // (ii) Threshold each anchor's best IoU, then pick the fg/bg anchors.
+    const T* anchor_max = anchor_to_gt_max.data<T>();
+    for (int64_t j = 0; j < col; ++j) {
+      if (anchor_max[j] > pos_threshold) {
+        target_label_data[j] = 1;
+      } else if (anchor_max[j] < neg_threshold) {
+        target_label_data[j] = 0;
+      }
+      // The index must address the flattened [N * M] anchor list, so it is
+      // offset by the image's first anchor, not by the number of samples.
+      const int index = anchor_offset + static_cast<int>(j);
+      if (target_label_data[j] == 1) {
+        fg_inds->push_back(index);
+      } else if (target_label_data[j] == 0) {
+        bg_inds->push_back(index);
+      }
+    }
+  }
+
+  // Reservoir-samples `num` of the entries (*inds)[offset, end) into
+  // (*inds)[offset, offset + num). Entries before `offset` belong to earlier
+  // images and are left alone. The vector itself is not resized here.
+  void ReservoirSampling(const int num, const int offset,
+                         std::minstd_rand engine,
+                         std::vector<int>* inds) const {
+    std::uniform_real_distribution<float> uniform(0, 1);
+    // Only what was appended after `offset` is a candidate; iterating up to
+    // inds->size() and adding `offset` would run past the end.
+    const int size = static_cast<int>(inds->size()) - offset;
+    if (num < 0 || size <= num) return;
+    for (int i = num; i < size; ++i) {
+      int rng_ind = std::floor(uniform(engine) * i);
+      if (rng_ind < num)
+        std::iter_swap(inds->begin() + rng_ind + offset,
+                       inds->begin() + i + offset);
+    }
+  }
+
+  // Drops everything after the first `num` entries of (*inds)[offset, end)
+  // and relabels the dropped anchors as ignored (-1).
+  void DiscardUnsampled(const int num, const int offset,
+                        const int anchor_offset, int64_t* target_label_data,
+                        std::vector<int>* inds) const {
+    const size_t keep = static_cast<size_t>(offset + std::max(num, 0));
+    if (inds->size() <= keep) return;
+    for (size_t k = keep; k < inds->size(); ++k) {
+      target_label_data[(*inds)[k] - anchor_offset] = -1;
+    }
+    inds->resize(keep);
+  }
+
+  // Assigns the labels of one image (`dist` holds its gt-by-anchor IoUs) and
+  // appends its sampled foreground / background indexes to the two lists.
+  void RpnTargetAssign(const framework::ExecutionContext& ctx,
+                       const Tensor& dist, const float pos_threshold,
+                       const float neg_threshold, const int rpn_batch_size,
+                       const int fg_num, std::minstd_rand engine,
+                       std::vector<int>* fg_inds, std::vector<int>* bg_inds,
+                       int64_t* target_label_data,
+                       const int anchor_offset = 0) const {
+    auto* dist_data = dist.data<T>();
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    int fg_offset = fg_inds->size();
+    int bg_offset = bg_inds->size();
+
+    // Calculate the max IoU between anchors and gt boxes
+    Tensor anchor_to_gt_max;
+    anchor_to_gt_max.mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(col), 1}),
+        platform::CPUPlace());
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    auto x = EigenMatrix<T>::From(dist);
+    auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
+    x_col_max.device(place) =
+        x.maximum(Eigen::DSizes<int, 1>(0))
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(col), 1));
+    // Follow the Faster RCNN's implementation
+    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
+                neg_threshold, target_label_data, fg_inds, bg_inds,
+                anchor_offset);
+    // Reservoir Sampling: at most fg_num foregrounds, the rest of the
+    // per-image batch is filled with backgrounds.
+    ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
+    DiscardUnsampled(fg_num, fg_offset, anchor_offset, target_label_data,
+                     fg_inds);
+    int fg_sampled = static_cast<int>(fg_inds->size()) - fg_offset;
+    int bg_num = rpn_batch_size - fg_sampled;
+    ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
+    DiscardUnsampled(bg_num, bg_offset, anchor_offset, target_label_data,
+                     bg_inds);
+  }
+
+  // Runs the assignment for every image in DistMat (one LoD sequence per
+  // image, or a single image when there is no LoD) and writes the outputs.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dist = context.Input<LoDTensor>("DistMat");
+    auto* loc_index = context.Output<Tensor>("LocationIndex");
+    auto* score_index = context.Output<Tensor>("ScoreIndex");
+    auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
+
+    auto col = dist->dims()[1];
+    int64_t n = dist->lod().size() == 0UL
+                    ? 1
+                    : static_cast<int64_t>(dist->lod().back().size() - 1);
+    if (dist->lod().size()) {
+      PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
+    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
+    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
+    float fg_fraction = context.Attr<float>("fg_fraction");
+
+    int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
+
+    int64_t* target_label_data =
+        tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
+
+    // Every anchor starts as ignored (-1).
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
+    iset(dev_ctx, tgt_lbl, static_cast<int64_t>(-1));
+
+    std::vector<int> fg_inds;
+    std::vector<int> bg_inds;
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed =
+        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    engine.seed(seed);
+
+    if (n == 1) {
+      RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
+                      rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                      target_label_data);
+    } else {
+      auto lod = dist->lod().back();
+      for (size_t i = 0; i < lod.size() - 1; ++i) {
+        Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
+        // Image i owns labels/indexes [i * col, (i + 1) * col).
+        const int anchor_offset = static_cast<int>(i * col);
+        RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
+                        rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
+                        target_label_data + anchor_offset, anchor_offset);
+      }
+    }
+    int* loc_index_data = loc_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size())}, context.GetPlace());
+    int* score_index_data = score_index->mutable_data<int>(
+        {static_cast<int>(fg_inds.size() + bg_inds.size())},
+        context.GetPlace());
+    // std::copy is well-defined for empty lists, unlike memcpy from
+    // &vec[0] of an empty vector.
+    std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
+    std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
+    std::copy(bg_inds.begin(), bg_inds.end(),
+              score_index_data + fg_inds.size());
+  }
+};
+
+class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "DistMat",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[K, M]. It is pair-wise distance matrix between the entities "
+        "represented by each row and each column. For example, assumed one "
+        "entity is A with shape [K], another entity is B with shape [M]. The "
+        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
+        "the distance is, the better matching the pairs are. Please note, "
+        "This tensor can contain LoD information to represent a batch of "
+        "inputs. One instance of this batch can contain different numbers of "
+        "entities.");
+    AddAttr<float>(
+        "rpn_positive_overlap",
+        "Minimum overlap required between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a positive example.")
+        .SetDefault(0.7);
+    AddAttr<float>(
+        "rpn_negative_overlap",
+        "Maximum overlap allowed between an anchor and ground-truth "
+        "box for the (anchor, gt box) pair to be a negative example.")
+        .SetDefault(0.3);
+    AddAttr<float>(
+        "fg_fraction",
+        "Target fraction of RoI minibatch that "
+        "is labeled foreground (i.e. class > 0), 0-th class is background.")
+        .SetDefault(0.25);
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest.")
+        .SetDefault(false);
+    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddOutput(
+        "LocationIndex",
+        "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
+        "shape of the LocationIndex is [F], F depends on the value of input "
+        "tensor and attributes.");
+    AddOutput(
+        "ScoreIndex",
+        "(Tensor), The indexes of foreground and background anchors in all "
+        "RPN anchors(The rest anchors are ignored). The shape of the "
+        "ScoreIndex is [F + B], F and B depend on the value of input "
+        "tensor and attributes.");
+    AddOutput("TargetLabel",
+              "(Tensor), The target labels of each anchor with shape "
+              "[N * M, 1], N is the number of images (LoD sequences) in "
+              "DistMat and M is the same as it is in DistMat.");
+    AddComment(R"DOC(
+Given the IoU between the ground-truth bboxes and the anchors, this operator
+assigns classification and regression targets to each prediction.
+The ScoreIndex and LocationIndex will be generated according to the DistMat.
+The rest anchors would not contribute to the RPN training loss.
+
+ScoreIndex is composed of foreground anchor indexes(positive labels) and
+background anchor indexes(negative labels). LocationIndex is exactly same
+as the foreground anchor indexes since we can not assign regression target to
+the background anchors.
+
+The classification targets(TargetLabel) is a binary class label (of being
+an object or not). Following the paper of Faster-RCNN, the positive labels
+are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that
+a single ground-truth box may assign positive labels to multiple anchors.
+A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap
+(0.3) for all ground-truth boxes. Anchors that are neither positive nor
+negative do not contribute to the training objective.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp,
+                  ops::RpnTargetAssignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel<float>,
+                       ops::RpnTargetAssignKernel<double>);
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 6af01297df5..bcfc716739b 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -30,6 +30,7 @@ __all__ = [
     'detection_output',
     'ssd_loss',
     'detection_map',
+    'rpn_target_assign',
     'anchor_generator',
 ]
 
@@ -44,6 +45,135 @@ for _OP in set(__auto__):
     globals()[_OP] = generate_layer_fn(_OP)
 
 
+def rpn_target_assign(loc,
+                      scores,
+                      anchor_box,
+                      gt_box,
+                      rpn_batch_size_per_im=256,
+                      fg_fraction=0.25,
+                      rpn_positive_overlap=0.7,
+                      rpn_negative_overlap=0.3):
+    """
+    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
+
+    Given the Intersection-over-Union (IoU) overlap between anchors and
+    ground-truth boxes, this layer assigns classification and regression
+    targets to each anchor; these target labels are used to train the
+    RPN. The classification target is a binary class label (of being
+    an object or not). Following the paper of Faster-RCNN, the positive labels
+    are two kinds of anchors: (i) the anchor/anchors with the highest IoU
+    overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap
+    higher than rpn_positive_overlap(0.7) with any ground-truth box. Note
+    that a single ground-truth box may assign positive labels to multiple
+    anchors. A non-positive anchor is when its IoU ratio is lower than
+    rpn_negative_overlap (0.3) for all ground-truth boxes. Anchors that are
+    neither positive nor negative do not contribute to the training objective.
+    The regression targets are the encoded ground-truth boxes associated with
+    the positive anchors.
+
+    Args:
+        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+            predicted locations of M bounding boxes. N is the batch size,
+            and each bounding box has four coordinate values and the layout
+            is [xmin, ymin, xmax, ymax].
+        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidences. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which correspond to the M bounding boxes.
+        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            each box is represented as [xmin, ymin, xmax, ymax],
+            [xmin, ymin] is the left top coordinate of the anchor box,
+            if the input is image feature map, they are close to the origin
+            of the coordinate system. [xmax, ymax] is the right bottom
+            coordinate of the anchor box.
+        gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        rpn_batch_size_per_im(int): Total number of RPN examples per image.
+        fg_fraction(float): Target fraction of RoI minibatch that is labeled
+            foreground (i.e. class > 0), 0-th class is background.
+        rpn_positive_overlap(float): Minimum overlap required between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a positive
+            example.
+        rpn_negative_overlap(float): Maximum overlap allowed between an anchor
+            and ground-truth box for the (anchor, gt box) pair to be a negative
+            example.
+
+    Returns:
+        tuple:
+               A tuple(predicted_scores, predicted_location, target_label,
+               target_bbox) is returned. The predicted_scores and
+               predicted_location is the predicted result of the RPN.
+               The target_label and target_bbox is the ground truth,
+               respectively. The predicted_location is a 2D Tensor with shape
+               [F, 4], and the shape of target_bbox is same as the shape of
+               the predicted_location, F is the number of the foreground
+               anchors. The predicted_scores is a 2D Tensor with shape
+               [F + B, 1], and the shape of target_label is same as the shape
+               of the predicted_scores, B is the number of the background
+               anchors. F and B depend on the input of this operator.
+
+    Examples:
+        .. code-block:: python
+
+            loc = layers.data(name='location', shape=[2, 80],
+                              append_batch_size=False, dtype='float32')
+            scores = layers.data(name='scores', shape=[2, 40],
+                              append_batch_size=False, dtype='float32')
+            anchor_box = layers.data(name='anchor_box', shape=[20, 4],
+                              append_batch_size=False, dtype='float32')
+            gt_box = layers.data(name='gt_box', shape=[10, 4],
+                              append_batch_size=False, dtype='float32')
+            score_pred, loc_pred, score_target, loc_target = (
+                fluid.layers.rpn_target_assign(loc=loc,
+                                               scores=scores,
+                                               anchor_box=anchor_box,
+                                               gt_box=gt_box))
+    """
+
+    helper = LayerHelper('rpn_target_assign', **locals())
+    # 1. Compute the regression target bboxes
+    target_bbox = box_coder(
+        prior_box=anchor_box,
+        target_box=gt_box,
+        code_type='encode_center_size',
+        box_normalized=False)
+
+    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    iou = iou_similarity(x=gt_box, y=anchor_box)
+
+    # 3. Assign target label to anchors (the kernel emits int32 indexes, int64 labels)
+    loc_index = helper.create_tmp_variable(dtype='int32')
+    score_index = helper.create_tmp_variable(dtype='int32')
+    target_label = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type="rpn_target_assign",
+        inputs={'DistMat': iou, },
+        outputs={
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': target_label,
+        },
+        attrs={
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'fg_fraction': fg_fraction,
+        })
+
+    # 4. Reshape and gather the target entry
+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
+    target_label = nn.reshape(x=target_label, shape=(-1, 1))
+    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+
+    predicted_scores = nn.gather(scores, score_index)
+    predicted_location = nn.gather(loc, loc_index)
+    target_label = nn.gather(target_label, score_index)
+    target_bbox = nn.gather(target_bbox, loc_index)
+    return predicted_scores, predicted_location, target_label, target_bbox
+
+
 def detection_output(loc,
                      scores,
                      prior_box,
@@ -388,7 +518,6 @@ def target_assign(input,
 
     Returns:
         tuple:
-
                A tuple(out, out_weight) is returned. out is a 3D Tensor with
                shape [N, P, K], N and P is the same as they are in
                `neg_indices`, K is the same as it in input of X. If
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
new file mode 100644
index 00000000000..df6e0faaca6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
+                      rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(iou)
+    anchor_to_gt_max = iou.max(axis=1)
+    gt_to_anchor_argmax = iou.argmax(axis=0)
+    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
+    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
+    # Start with every anchor ignored (-1), then mark the positives.
+    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
+    tgt_lbl[anchors_with_max_overlap] = 1
+    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
+    # Keep at most num_fg positives; the surplus is ignored again.
+    num_fg = int(fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(tgt_lbl == 1)[0]
+    if len(fg_inds) > num_fg:
+        disable_inds = np.random.choice(
+            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+        tgt_lbl[disable_inds] = -1
+    # Anchors below the negative overlap are background even when there are
+    # fewer than num_bg of them; only a surplus is sub-sampled.
+    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    if len(bg_inds) > num_bg:
+        bg_inds = np.random.choice(bg_inds, size=num_bg, replace=False)
+    tgt_lbl[bg_inds] = 0
+    fg_inds = np.where(tgt_lbl == 1)[0]
+    bg_inds = np.where(tgt_lbl == 0)[0]
+    loc_index = fg_inds
+    score_index = np.hstack((fg_inds, bg_inds))
+    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    return loc_index, score_index, tgt_lbl
+
+
+class TestRpnTargetAssignOp(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 8)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 256,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.3,
+            'fg_fraction': 0.25,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
+                                                            0.25)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRpnTargetAssignOp2(OpTest):
+    def setUp(self):
+        iou = np.random.random((10, 20)).astype("float32")
+        self.op_type = "rpn_target_assign"
+        self.inputs = {'DistMat': iou}
+        self.attrs = {
+            'rpn_batch_size_per_im': 128,
+            'rpn_positive_overlap': 0.5,
+            'rpn_negative_overlap': 0.5,
+            'fg_fraction': 0.5,
+            'fix_seed': True
+        }
+        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
+                                                            0.5)
+        self.outputs = {
+            'LocationIndex': loc_index,
+            'ScoreIndex': score_index,
+            'TargetLabel': tgt_lbl,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab