diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 8840765841d2bce3660f172be9e10c363977e678..0247093d03a9142bfdfb01927b6ad6a75e549778 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -77,17 +77,20 @@ struct BoxDecodeAndClipFunctor { const T *var; const int *index; const T *im_info; + const bool pixel_offset; T *proposals; BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, - const int *index, const T *im_info, T *proposals) + const int *index, const T *im_info, T *proposals, + bool pixel_offset = true) : anchor(anchor), deltas(deltas), var(var), index(index), im_info(im_info), - proposals(proposals) {} + proposals(proposals), + pixel_offset(pixel_offset) {} T bbox_clip_default{static_cast(kBBoxClipDefault)}; @@ -98,8 +101,9 @@ struct BoxDecodeAndClipFunctor { T axmax = anchor[k + 2]; T aymax = anchor[k + 3]; - T w = axmax - axmin + 1.0; - T h = aymax - aymin + 1.0; + T offset = pixel_offset ? static_cast(1.0) : 0; + T w = axmax - axmin + offset; + T h = aymax - aymin + offset; T cx = axmin + 0.5 * w; T cy = aymin + 0.5 * h; @@ -123,13 +127,13 @@ struct BoxDecodeAndClipFunctor { T oxmin = d_cx - d_w * 0.5; T oymin = d_cy - d_h * 0.5; - T oxmax = d_cx + d_w * 0.5 - 1.; - T oymax = d_cy + d_h * 0.5 - 1.; + T oxmax = d_cx + d_w * 0.5 - offset; + T oymax = d_cy + d_h * 0.5 - offset; - proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); - proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); - proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); - proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + proposals[i * 4] = Max(Min(oxmin, im_info[1] - offset), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - offset), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - offset), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - offset), 0.); } __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } @@ -141,7 +145,8 @@ template static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, const T min_size, const int num, int *keep_num, int *keep, - bool is_scale = true) { + bool is_scale = true, + bool pixel_offset = true) { T im_h = im_info[0]; T im_w = im_info[1]; @@ -157,19 +162,25 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, T ymin = bboxes[k + 1]; T xmax = bboxes[k + 2]; T ymax = bboxes[k + 3]; + T offset = pixel_offset ? 
static_cast(1.0) : 0; + T w = xmax - xmin + offset; + T h = ymax - ymin + offset; + if (pixel_offset) { + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + if (is_scale) { + w = (xmax - xmin) / im_info[2] + 1.; + h = (ymax - ymin) / im_info[2] + 1.; + } - T w = xmax - xmin + 1.0; - T h = ymax - ymin + 1.0; - T cx = xmin + w / 2.; - T cy = ymin + h / 2.; - - if (is_scale) { - w = (xmax - xmin) / im_info[2] + 1.; - h = (ymax - ymin) / im_info[2] + 1.; - } - - if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) { - keep_index[threadIdx.x] = i; + if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + } else { + if (w >= min_size && h >= min_size) { + keep_index[threadIdx.x] = i; + } } __syncthreads(); if (threadIdx.x == 0) { @@ -187,19 +198,23 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, } } -static __device__ float IoU(const float *a, const float *b) { +static __device__ float IoU(const float *a, const float *b, + const bool pixel_offset = true) { + float offset = pixel_offset ? static_cast(1.0) : 0; float left = max(a[0], b[0]), right = min(a[2], b[2]); float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float width = max(right - left + offset, 0.f), + height = max(bottom - top + offset, 0.f); float inter_s = width * height; - float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); - float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + float s_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float s_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); return inter_s / (s_a + s_b - inter_s); } static __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh, - const float *dev_boxes, uint64_t *dev_mask) { + const float *dev_boxes, uint64_t *dev_mask, + bool pixel_offset = true) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; @@ -231,7 +246,8 @@ static __global__ void NMSKernel(const int n_boxes, start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { - if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + if (IoU(cur_box, block_boxes + i * 4, pixel_offset) > + nms_overlap_thresh) { t |= 1ULL << i; } } @@ -243,7 +259,7 @@ static __global__ void NMSKernel(const int n_boxes, template static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const Tensor &sorted_indices, const T nms_threshold, - Tensor *keep_out) { + Tensor *keep_out, bool pixel_offset = true) { int boxes_num = proposals.dims()[0]; const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), @@ -255,7 +271,8 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, framework::Vector mask(boxes_num * col_blocks); NMSKernel<<>>(boxes_num, nms_threshold, boxes, mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace()))); + platform::CUDAPlace, ctx.GetPlace())), + pixel_offset); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b7a23c48fb8c7bdfd4ce4fd78f42c703ddcae43e..b262f05d6b187a39e6fca2086aab0fe7b23b5944 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -31,7 +31,7 @@ struct RangeInitFunctor { }; template -inline HOSTDEVICE T RoIArea(const T* box, bool normalized) { +inline HOSTDEVICE T 
RoIArea(const T* box, bool pixel_offset = true) { if (box[2] < box[0] || box[3] < box[1]) { // If coordinate values are is invalid // (e.g. xmax < xmin or ymax < ymin), return 0. @@ -39,11 +39,11 @@ inline HOSTDEVICE T RoIArea(const T* box, bool normalized) { } else { const T w = box[2] - box[0]; const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { + if (pixel_offset) { // If coordinate values are not within range [0, 1]. return (w + 1) * (h + 1); + } else { + return w * h; } } } @@ -157,10 +157,12 @@ template void ClipTiledBoxes(const platform::DeviceContext& ctx, const framework::Tensor& im_info, const framework::Tensor& input_boxes, - framework::Tensor* out, bool is_scale = true) { + framework::Tensor* out, bool is_scale = true, + bool pixel_offset = true) { T* out_data = out->mutable_data(ctx.GetPlace()); const T* im_info_data = im_info.data(); const T* input_boxes_data = input_boxes.data(); + T offset = pixel_offset ? static_cast(1.0) : 0; T zero(0); T im_w = is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1]; @@ -168,13 +170,17 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx, is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; for (int64_t i = 0; i < input_boxes.numel(); ++i) { if (i % 4 == 0) { - out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + out_data[i] = + std::max(std::min(input_boxes_data[i], im_w - offset), zero); } else if (i % 4 == 1) { - out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + out_data[i] = + std::max(std::min(input_boxes_data[i], im_h - offset), zero); } else if (i % 4 == 2) { - out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + out_data[i] = + std::max(std::min(input_boxes_data[i], im_w - offset), zero); } else { - out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + out_data[i] = + std::max(std::min(input_boxes_data[i], im_h - offset), zero); } } } @@ -184,29 +190,35 @@ template void FilterBoxes(const platform::DeviceContext& ctx, const framework::Tensor* boxes, float min_size, const framework::Tensor& im_info, bool is_scale, - framework::Tensor* keep) { + framework::Tensor* keep, bool pixel_offset = true) { const T* im_info_data = im_info.data(); const T* boxes_data = boxes->data(); keep->Resize({boxes->dims()[0]}); min_size = std::max(min_size, 1.0f); int* keep_data = keep->mutable_data(ctx.GetPlace()); + T offset = pixel_offset ? 
static_cast(1.0) : 0; int keep_len = 0; for (int i = 0; i < boxes->dims()[0]; ++i) { - T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; - T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; - T x_ctr = boxes_data[4 * i] + ws / 2; - T y_ctr = boxes_data[4 * i + 1] + hs / 2; + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + offset; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + offset; + if (pixel_offset) { + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (is_scale) { - ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1; - hs = - (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + 1; - } - - if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && - y_ctr <= im_info_data[0]) { - keep_data[keep_len++] = i; + if (is_scale) { + ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1; + hs = (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + + 1; + } + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } else { + if (ws >= min_size && hs >= min_size) { + keep_data[keep_len++] = i; + } } } keep->Resize({keep_len}); @@ -216,8 +228,8 @@ template static void BoxCoder(const platform::DeviceContext& ctx, framework::Tensor* all_anchors, framework::Tensor* bbox_deltas, - framework::Tensor* variances, - framework::Tensor* proposals) { + framework::Tensor* variances, framework::Tensor* proposals, + const bool pixel_offset = true) { T* proposals_data = proposals->mutable_data(ctx.GetPlace()); int64_t row = all_anchors->dims()[0]; @@ -230,9 +242,11 @@ static void BoxCoder(const platform::DeviceContext& ctx, variances_data = variances->data(); } + T offset = pixel_offset ? 
static_cast(1.0) : 0; for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + offset; + T anchor_height = + anchor_data[i * len + 3] - anchor_data[i * len + 1] + offset; T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; @@ -270,8 +284,8 @@ static void BoxCoder(const platform::DeviceContext& ctx, proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - offset; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - offset; } // return proposals; } diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index b0c9d968e47b7968584d6af234fe1debcde153d0..4e514e62f4081edf8a6419da5771c6092d66b1c4 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -103,6 +103,9 @@ class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("refer_scale", "The referring scale of FPN layer with" " specified level"); + AddAttr("pixel_offset", "(bool, default True),", + "If true, im_shape pixel offset is 1.") + .SetDefault(true); AddComment(R"DOC( This operator distribute all proposals into different fpn level, with respect to scale of the proposals, the referring scale and @@ -134,4 +137,8 @@ REGISTER_OP_VERSION(distribute_fpn_proposals) .NewOutput("MultiLevelRoisNum", "The RoIs' number of each image on multiple " "levels. 
The number on each level has the shape of (B)," - "B is the number of images.")); + "B is the number of images.")) + .AddCheckpoint( + R"ROC(Register distribute_fpn_proposals for adding the attribute of pixel_offset)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "pixel_offset", "If true, im_shape pixel offset is 1.", true)); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 27c06a0f8fb207b5dc85c7875ea91428b16e606c..7550ff91fd542877599dd25e341f951bdb6e604b 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -43,15 +43,15 @@ __global__ void GPUDistFpnProposalsHelper( const int nthreads, const T* rois, const int lod_size, const int refer_level, const int refer_scale, const int max_level, const int min_level, int* roi_batch_id_data, int* sub_lod_list, - int* target_lvls) { + int* target_lvls, bool pixel_offset = true) { CUDA_KERNEL_LOOP(i, nthreads) { const T* offset_roi = rois + i * BBoxSize; int roi_batch_ind = roi_batch_id_data[i]; // get the target level of current rois - T roi_area = RoIArea(offset_roi, false); + T roi_area = RoIArea(offset_roi, pixel_offset); T roi_scale = sqrt(roi_area); int tgt_lvl = floor( - log2(roi_scale / static_cast(refer_scale) + (T)1e-6) + refer_level); + log2(roi_scale / static_cast(refer_scale) + (T)1e-8) + refer_level); tgt_lvl = min(max_level, max(tgt_lvl, min_level)); target_lvls[i] = tgt_lvl; // compute number of rois in the same batch and same target level @@ -73,6 +73,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { const int max_level = ctx.Attr("max_level"); const int refer_level = ctx.Attr("refer_level"); const int refer_scale = ctx.Attr("refer_scale"); + const bool pixel_offset = ctx.Attr("pixel_offset"); int num_level = max_level - min_level + 1; // check that the fpn_rois is not empty @@ -126,7 +127,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), - sub_lod_list_data, target_lvls_data); + sub_lod_list_data, target_lvls_data, pixel_offset); dev_ctx.Wait(); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 465435637cff659082570f1ef9fcf1cb91983321..e3c125b0a6888575305c1961586cfea01565d21e 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -44,7 +44,7 @@ inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { } template -static inline T BBoxArea(const T* box, bool normalized) { +static inline T BBoxArea(const T* box, bool pixel_offset) { if (box[2] < box[0] || box[3] < box[1]) { // If coordinate values are is invalid // (e.g. xmax < xmin or ymax < ymin), return 0. @@ -52,11 +52,11 @@ static inline T BBoxArea(const T* box, bool normalized) { } else { const T w = box[2] - box[0]; const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { + if (pixel_offset) { // If coordinate values are not within range [0, 1]. 
return (w + 1) * (h + 1); + } else { + return w * h; } } } @@ -77,6 +77,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { const int max_level = context.Attr("max_level"); const int refer_level = context.Attr("refer_level"); const int refer_scale = context.Attr("refer_scale"); + const bool pixel_offset = context.Attr("pixel_offset"); const int num_level = max_level - min_level + 1; // check that the fpn_rois is not empty @@ -108,7 +109,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { // get the target level of current rois - T roi_scale = std::sqrt(BBoxArea(rois_data, false)); + T roi_scale = std::sqrt(BBoxArea(rois_data, pixel_offset)); int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + (T)1e-6) + refer_level); tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 7c2fd599fa6a2eee081361d0615efaa387563c50..44554a941dce4ba8d2dc4962a4f6f358f458c445 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -87,6 +87,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { float nms_thresh = context.Attr("nms_thresh"); float min_size = context.Attr("min_size"); float eta = context.Attr("eta"); + bool pixel_offset = context.Attr("pixel_offset"); auto &dev_ctx = context.template device_context(); @@ -134,10 +135,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair tensor_pair = - ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, - bbox_deltas_slice, scores_slice, pre_nms_top_n, - post_nms_top_n, nms_thresh, min_size, eta); + std::pair tensor_pair = ProposalForOneImage( + dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice, + scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, + eta, pixel_offset); Tensor &proposals = tensor_pair.first; Tensor &scores = tensor_pair.second; @@ -168,7 +169,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { const Tensor &bbox_deltas_slice, // [M, 4] const Tensor &scores_slice, // [N, 1] int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, - float eta) const { + float eta, bool pixel_offset = true) const { auto *scores_data = scores_slice.data(); // Sort index @@ -203,12 +204,15 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals, + pixel_offset); - ClipTiledBoxes(ctx, im_shape_slice, proposals, &proposals, false); + ClipTiledBoxes(ctx, im_shape_slice, proposals, &proposals, false, + pixel_offset); Tensor keep; - FilterBoxes(ctx, &proposals, min_size, im_shape_slice, false, &keep); + FilterBoxes(ctx, &proposals, min_size, im_shape_slice, false, &keep, + pixel_offset); // Handle the case when there is no keep index left if (keep.numel() == 0) { math::SetConstant set_zero; @@ -229,7 +233,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { return std::make_pair(bbox_sel, scores_filter); } - Tensor keep_nms = NMS(ctx, &bbox_sel, 
&scores_filter, nms_thresh, eta);
+    Tensor keep_nms =
+        NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset);
 
     if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
       keep_nms.Resize({post_nms_top_n});
@@ -280,6 +285,9 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker {
              "Proposal height and width both need to be greater "
              "than this min_size.");
     AddAttr<float>("eta", "The parameter for adaptive NMS.");
+    AddAttr<bool>("pixel_offset", "(bool, default True),",
+                  "If true, im_shape pixel offset is 1.")
+        .SetDefault(true);
     AddComment(R"DOC(
 This operator is the second version of generate_proposals op to generate
 bounding box proposals for Faster RCNN.
@@ -312,3 +320,8 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(generate_proposals_v2,
                        ops::GenerateProposalsV2Kernel<float>,
                        ops::GenerateProposalsV2Kernel<double>);
+REGISTER_OP_VERSION(generate_proposals_v2)
+    .AddCheckpoint(
+        R"ROC(Register generate_proposals_v2 for adding the attribute of pixel_offset)ROC",
+        paddle::framework::compatible::OpVersionDesc().NewAttr(
+            "pixel_offset", "If true, im_shape pixel offset is 1.", true));
diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
index 70020cdc64ef58a47624bd0ce27898e86623dfa3..6244827f685ba5bb1fc416787b529bf069e5ac66 100644
--- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu
@@ -36,7 +36,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
     const Tensor &bbox_deltas,  // [M, 4]
     const Tensor &scores,       // [N, 1]
     int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
-    float eta) {
+    float eta, bool pixel_offset) {
   // 1. pre nms
   Tensor scores_sort, index_sort;
   SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
@@ -54,7 +54,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
     platform::ForRange<platform::CUDADeviceContext> for_range(ctx, pre_nms_num);
     for_range(BoxDecodeAndClipFunctor<T>{
         anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
-        index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>()});
+        index_sort.data<int>(), im_shape.data<T>(), proposals.data<T>(),
+        pixel_offset});
   }
 
   // 3. filter
@@ -65,7 +66,7 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   auto stream = ctx.stream();
   FilterBBoxes<T><<<1, 512, 0, stream>>>(
       proposals.data<T>(), im_shape.data<T>(), min_size, pre_nms_num,
-      keep_num_t.data<int>(), keep_index.data<int>(), false);
+      keep_num_t.data<int>(), keep_index.data<int>(), false, pixel_offset);
   int keep_num;
   const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
@@ -94,7 +95,8 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
 
   // 4. nms
   Tensor keep_nms;
-  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
+  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms,
+         pixel_offset);
   if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
     keep_nms.Resize({post_nms_top_n});
   }
@@ -129,6 +131,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel<T> {
     float nms_thresh = context.Attr<float>("nms_thresh");
     float min_size = context.Attr<float>("min_size");
     float eta = context.Attr<float>("eta");
+    bool pixel_offset = context.Attr<bool>("pixel_offset");
     PADDLE_ENFORCE_GE(eta, 1.,
                       platform::errors::InvalidArgument(
                           "Not support adaptive NMS. 
The attribute 'eta' " @@ -184,10 +187,10 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair box_score_pair = - ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, - bbox_deltas_slice, scores_slice, pre_nms_top_n, - post_nms_top_n, nms_thresh, min_size, eta); + std::pair box_score_pair = ProposalForOneImage( + dev_ctx, im_shape_slice, anchors, variances, bbox_deltas_slice, + scores_slice, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, + eta, pixel_offset); Tensor &proposals = box_score_pair.first; Tensor &scores = box_score_pair.second; diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h index febdee8263553064ca1f59124fdf29df62aee1d8..0e448d42fc2ed37b900015d47ded8d9f6387e1cb 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/fluid/operators/detection/nms_util.h @@ -130,7 +130,7 @@ static inline framework::Tensor VectorToTensor( template framework::Tensor NMS(const platform::DeviceContext& ctx, framework::Tensor* bbox, framework::Tensor* scores, - T nms_threshold, float eta) { + T nms_threshold, float eta, bool pixel_offset = true) { int64_t num_boxes = bbox->dims()[0]; // 4: [xmin ymin xmax ymax] int64_t box_size = bbox->dims()[1]; @@ -144,13 +144,15 @@ framework::Tensor NMS(const platform::DeviceContext& ctx, int selected_num = 0; T adaptive_threshold = nms_threshold; const T* bbox_data = bbox->data(); + bool normalized = pixel_offset ? false : true; while (sorted_indices.size() != 0) { int idx = sorted_indices.back().second; bool flag = true; for (int kept_idx : selected_indices) { if (flag) { - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, false); + T overlap = + JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, normalized); flag = (overlap <= adaptive_threshold); } else { break; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6a4a88a004586daf83716b3e3c2cd3ea2b4fa376..5627b4f229e100d9979663e8688b8694188bab0f 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -175,6 +175,10 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "If <=0, then grid points are adaptive to roi_width " "and pooled_w, likewise for height") .SetDefault(-1); + AddAttr("aligned", + "(bool, default False)," + "If true, pixel shift it by -0.5 for align more perfectly") + .SetDefault(false); AddComment(R"DOC( **RoIAlign Operator** @@ -242,7 +246,14 @@ REGISTER_OP_VERSION(roi_align) "it is not used in object detection models yet.")) .AddCheckpoint( R"ROC( - Upgrade roi_align add a new input [RoisNum])ROC", + Upgrade roi_align add a new input [RoisNum])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( "RoisNum", - "The number of RoIs in each image. RoisNum is dispensable.")); + "The number of RoIs in each image. 
RoisNum is dispensable.")) + .AddCheckpoint( + R"ROC( + Upgrade roi_align add a new input [aligned])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "aligned", + "If true, pixel shift it by -0.5 for align more perfectly.", + false)); diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 3a4ce55f4fb77160e7fc645539c1868fe2864b19..074a00fb1c33cea4d77cb08aa0eca37a765b554e 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -105,7 +105,8 @@ __global__ void GPUROIAlignForward( const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data) { + const int sampling_ratio, int* roi_batch_id_data, T* output_data, + const bool continuous_coordinate) { CUDA_KERNEL_LOOP(i, nthreads) { int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; @@ -115,13 +116,19 @@ __global__ void GPUROIAlignForward( const T* offset_input_rois = input_rois + n * kROISize; int roi_batch_ind = roi_batch_id_data[n]; - T roi_xmin = offset_input_rois[0] * spatial_scale; - T roi_ymin = offset_input_rois[1] * spatial_scale; - T roi_xmax = offset_input_rois[2] * spatial_scale; - T roi_ymax = offset_input_rois[3] * spatial_scale; + T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - T roi_width = max(roi_xmax - roi_xmin, static_cast(1.)); - T roi_height = max(roi_ymax - roi_ymin, static_cast(1.)); + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -153,14 +160,12 @@ __global__ void GPUROIAlignForward( } template -__global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois, - const T* out_grad, const int num_rois, - const float spatial_scale, - const int channels, const int height, - const int width, const int pooled_height, - const int pooled_width, - const int sampling_ratio, - int* roi_batch_id_data, T* input_grad) { +__global__ void GPUROIAlignBackward( + const int nthreads, const T* input_rois, const T* out_grad, + const int num_rois, const float spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, + T* input_grad, const bool continuous_coordinate) { CUDA_KERNEL_LOOP(i, nthreads) { int pw = i % pooled_width; int ph = (i / pooled_width) % pooled_height; @@ -169,13 +174,18 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois, const T* offset_input_rois = input_rois + n * kROISize; int roi_batch_ind = roi_batch_id_data[n]; - T roi_xmin = offset_input_rois[0] * spatial_scale; - T roi_ymin = offset_input_rois[1] * spatial_scale; - T roi_xmax = offset_input_rois[2] * spatial_scale; - T roi_ymax = offset_input_rois[3] * spatial_scale; - - T roi_width = max(roi_xmax - roi_xmin, static_cast(1.)); - T 
roi_height = max(roi_ymax - roi_ymin, static_cast(1.)); + T roi_offset = continuous_coordinate ? T(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -236,6 +246,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto in_dims = in->dims(); int batch_size = in_dims[0]; @@ -316,7 +327,7 @@ class GPUROIAlignOpKernel : public framework::OpKernel { GPUROIAlignForward<<>>( output_size, in->data(), rois->data(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace())); + out->mutable_data(ctx.GetPlace()), aligned); } }; @@ -334,6 +345,7 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); int rois_num = rois->dims()[0]; int channels = in->dims()[1]; @@ -390,8 +402,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { GPUROIAlignBackward<<>>( output_grad_size, rois->data(), out_grad->data(), rois_num, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_id_data, - in_grad->mutable_data(ctx.GetPlace())); + sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace()), + aligned); } } }; diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 066125a92fbd9d1d49f0ba023366865620674e1f..d03cd617e6df6eb55d0596008fa828b26b4e010e 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -145,6 +145,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto& dev_ctx = ctx.template device_context(); @@ -215,15 +216,21 @@ class CPUROIAlignOpKernel : public framework::OpKernel { } T* output_data = out->mutable_data(ctx.GetPlace()); const T* rois_data = rois->data(); + T roi_offset = aligned ? 
T(0.5) : 0; for (int n = 0; n < rois_num; ++n) { int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale; - T roi_ymin = rois_data[1] * spatial_scale; - T roi_xmax = rois_data[2] * spatial_scale; - T roi_ymax = rois_data[3] * spatial_scale; + T roi_xmin = rois_data[0] * spatial_scale - roi_offset; + T roi_ymin = rois_data[1] * spatial_scale - roi_offset; + T roi_xmax = rois_data[2] * spatial_scale - roi_offset; + T roi_ymax = rois_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } - T roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); - T roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* batch_data = input_data + roi_batch_id * in_stride[0]; @@ -290,6 +297,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); auto in_dims = in->dims(); + auto aligned = ctx.Attr("aligned"); int channels = in_dims[1]; int height = in_dims[2]; @@ -344,14 +352,21 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out_grad->dims()); + T roi_offset = aligned ? T(0.5) : 0; for (int n = 0; n < rois_num; ++n) { int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale; - T roi_ymin = rois_data[1] * spatial_scale; - T roi_xmax = rois_data[2] * spatial_scale; - T roi_ymax = rois_data[3] * spatial_scale; - T roi_width = std::max(roi_xmax - roi_xmin, static_cast(1.)); - T roi_height = std::max(roi_ymax - roi_ymin, static_cast(1.)); + T roi_xmin = rois_data[0] * spatial_scale - roi_offset; + T roi_ymin = rois_data[1] * spatial_scale - roi_offset; + T roi_xmax = rois_data[2] * spatial_scale - roi_offset; + T roi_ymax = rois_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); for (int c = 0; c < channels; ++c) { diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index ec0125b28ed1b870025adbfd2bd4ba78244bcc11..2cd7889d6e3aa629fa71dfe0c7b91613914000eb 100644 --- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -31,7 +31,8 @@ class TestDistributeFPNProposalsOp(OpTest): 'max_level': self.roi_max_level, 'min_level': self.roi_min_level, 'refer_scale': self.canonical_scale, - 'refer_level': self.canonical_level + 'refer_level': self.canonical_level, + 'pixel_offset': self.pixel_offset, } output = [('out%d' % i, self.rois_fpn[i]) for i in range(len(self.rois_fpn))] @@ -47,10 +48,12 @@ class TestDistributeFPNProposalsOp(OpTest): self.canonical_scale = 224 self.canonical_level = 4 self.images_shape = [512, 512] + self.pixel_offset = True def boxes_area(self, boxes): - w = (boxes[:, 2] - 
boxes[:, 0] + 1) - h = (boxes[:, 3] - boxes[:, 1] + 1) + offset = 1 if self.pixel_offset else 0 + w = (boxes[:, 2] - boxes[:, 0] + offset) + h = (boxes[:, 3] - boxes[:, 1] + offset) areas = w * h assert np.all(areas >= 0), 'Negative areas founds' return areas @@ -59,7 +62,7 @@ class TestDistributeFPNProposalsOp(OpTest): s = np.sqrt(self.boxes_area(rois)) s0 = self.canonical_scale lvl0 = self.canonical_level - target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6)) + target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-8)) target_lvls = np.clip(target_lvls, lvl_min, lvl_max) return target_lvls @@ -131,7 +134,8 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp): 'max_level': self.roi_max_level, 'min_level': self.roi_min_level, 'refer_scale': self.canonical_scale, - 'refer_level': self.canonical_level + 'refer_level': self.canonical_level, + 'pixel_offset': self.pixel_offset, } output = [('out%d' % i, self.rois_fpn[i]) for i in range(len(self.rois_fpn))] @@ -147,5 +151,16 @@ class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp): } +class TestDistributeFPNProposalsOpNoOffset( + TestDistributeFPNProposalsOpWithRoisNum): + def init_test_case(self): + self.roi_max_level = 5 + self.roi_min_level = 2 + self.canonical_scale = 224 + self.canonical_level = 4 + self.images_shape = [512, 512] + self.pixel_offset = False + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 8304016d7d0d66c8a658e3fd2318b6caa4882b85..6b9eeaa0867c104e829afb36ff5c39819a9bfd7a 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -21,7 +21,6 @@ import math import paddle import paddle.fluid as fluid from op_test import OpTest -from test_multiclass_nms_op import nms from test_anchor_generator_op import anchor_generator_in_python import copy @@ -111,18 +110,19 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, return proposals, scores -def box_coder(all_anchors, bbox_deltas, variances): +def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True): """ Decode proposals by anchors and bbox_deltas from RPN """ + offset = 1 if pixel_offset else 0 #proposals: xmin, ymin, xmax, ymax proposals = np.zeros_like(bbox_deltas, dtype=np.float32) #anchor_loc: width, height, center_x, center_y anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) - anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1 - anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1 + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] @@ -152,51 +152,60 @@ def box_coder(all_anchors, bbox_deltas, variances): pred_bbox[i, 3] = math.exp( min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, 1] - proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 - proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1 - proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - offset + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - offset return proposals -def clip_tiled_boxes(boxes, im_shape): 
+def clip_tiled_boxes(boxes, im_shape, pixel_offset=True): """Clip boxes to image boundaries. im_shape is [height, width] and boxes has shape (N, 4 * num_tiled_boxes).""" assert boxes.shape[1] % 4 == 0, \ 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format( boxes.shape[1] ) + offset = 1 if pixel_offset else 0 # x1 >= 0 - boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + boxes[:, 0::4] = np.maximum( + np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0) # y1 >= 0 - boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + boxes[:, 1::4] = np.maximum( + np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0) # x2 < im_shape[1] - boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + boxes[:, 2::4] = np.maximum( + np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0) # y2 < im_shape[0] - boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + boxes[:, 3::4] = np.maximum( + np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0) return boxes -def filter_boxes(boxes, min_size, im_info): +def filter_boxes(boxes, min_size, im_info, pixel_offset=True): """Only keep boxes with both sides >= min_size and center within the image. """ # Scale min_size to match image scale im_scale = im_info[2] min_size = max(min_size, 1.0) - ws = boxes[:, 2] - boxes[:, 0] + 1 - hs = boxes[:, 3] - boxes[:, 1] + 1 - ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 - hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 - x_ctr = boxes[:, 0] + ws / 2. - y_ctr = boxes[:, 1] + hs / 2. - keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & - (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] + offset = 1 if pixel_offset else 0 + ws = boxes[:, 2] - boxes[:, 0] + offset + hs = boxes[:, 3] - boxes[:, 1] + offset + if pixel_offset: + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. + keep = np.where((ws_orig_scale >= min_size) & ( + hs_orig_scale >= min_size) & (x_ctr < im_info[1]) & (y_ctr < + im_info[0]))[0] + else: + keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep -def iou(box_a, box_b): +def iou(box_a, box_b, pixel_offset=True): """ Apply intersection-over-union overlap between box_a and box_b """ @@ -209,9 +218,9 @@ def iou(box_a, box_b): ymin_b = min(box_b[1], box_b[3]) xmax_b = max(box_b[0], box_b[2]) ymax_b = max(box_b[1], box_b[3]) - - area_a = (ymax_a - ymin_a + 1) * (xmax_a - xmin_a + 1) - area_b = (ymax_b - ymin_b + 1) * (xmax_b - xmin_b + 1) + offset = 1 if pixel_offset else 0 + area_a = (ymax_a - ymin_a + offset) * (xmax_a - xmin_a + offset) + area_b = (ymax_b - ymin_b + offset) * (xmax_b - xmin_b + offset) if area_a <= 0 and area_b <= 0: return 0.0 @@ -220,14 +229,14 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) + inter_area = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) return iou_ratio -def nms(boxes, scores, nms_threshold, eta=1.0): +def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True): """Apply non-maximum suppression at test time to avoid detecting too many overlapping bounding boxes for a given object. 
Args: @@ -252,7 +261,9 @@ def nms(boxes, scores, nms_threshold, eta=1.0): for k in range(len(selected_indices)): if keep: kept_idx = selected_indices[k] - overlap = iou(boxes[idx], boxes[kept_idx]) + overlap = iou(boxes[idx], + boxes[kept_idx], + pixel_offset=pixel_offset) keep = True if overlap <= adaptive_threshold else False else: break diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py index 26c443008db5049a3e081243c1cb5b509e5d27e0..0a67004518771f2b14478b943643172888bec27c 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py @@ -21,7 +21,6 @@ import math import paddle import paddle.fluid as fluid from op_test import OpTest -from test_multiclass_nms_op import nms from test_anchor_generator_op import anchor_generator_in_python import copy from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms @@ -29,7 +28,7 @@ from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors, variances, pre_nms_topN, post_nms_topN, - nms_thresh, min_size, eta): + nms_thresh, min_size, eta, pixel_offset): all_anchors = anchors.reshape(-1, 4) rois = np.empty((0, 5), dtype=np.float32) roi_probs = np.empty((0, 1), dtype=np.float32) @@ -42,7 +41,8 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors, img_i_boxes, img_i_probs = proposal_for_one_image( im_shape[img_idx, :], all_anchors, variances, bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], - pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) + pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta, + pixel_offset) rois_num.append(img_i_probs.shape[0]) rpn_rois.append(img_i_boxes) rpn_roi_probs.append(img_i_probs) @@ -52,7 +52,7 @@ def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors, def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas, scores, pre_nms_topN, post_nms_topN, nms_thresh, - min_size, eta): + min_size, eta, pixel_offset): # Transpose and reshape predicted bbox transformations to get them # into the same order as the anchors: # - bbox deltas will be (4 * A, H, W) format from conv output @@ -83,12 +83,12 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas, scores = scores[order, :] bbox_deltas = bbox_deltas[order, :] all_anchors = all_anchors[order, :] - proposals = box_coder(all_anchors, bbox_deltas, variances) + proposals = box_coder(all_anchors, bbox_deltas, variances, pixel_offset) # clip proposals to image (may result in proposals with zero area # that will be removed in the next step) - proposals = clip_tiled_boxes(proposals, im_shape) + proposals = clip_tiled_boxes(proposals, im_shape, pixel_offset) # remove predicted boxes with height or width < min_size - keep = filter_boxes(proposals, min_size, im_shape) + keep = filter_boxes(proposals, min_size, im_shape, pixel_offset) if len(keep) == 0: proposals = np.zeros((1, 4)).astype('float32') scores = np.zeros((1, 1)).astype('float32') @@ -103,7 +103,8 @@ def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas, keep = nms(boxes=proposals, scores=scores, nms_threshold=nms_thresh, - eta=eta) + eta=eta, + pixel_offset=pixel_offset) if post_nms_topN > 0 and post_nms_topN < len(keep): keep = keep[:post_nms_topN] proposals = proposals[keep, :] @@ -112,17 +113,21 @@ def 
proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas, return proposals, scores -def filter_boxes(boxes, min_size, im_shape): +def filter_boxes(boxes, min_size, im_shape, pixel_offset=True): """Only keep boxes with both sides >= min_size and center within the image. """ # Scale min_size to match image scale min_size = max(min_size, 1.0) - ws = boxes[:, 2] - boxes[:, 0] + 1 - hs = boxes[:, 3] - boxes[:, 1] + 1 - x_ctr = boxes[:, 0] + ws / 2. - y_ctr = boxes[:, 1] + hs / 2. - keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[1]) - & (y_ctr < im_shape[0]))[0] + offset = 1 if pixel_offset else 0 + ws = boxes[:, 2] - boxes[:, 0] + offset + hs = boxes[:, 3] - boxes[:, 1] + offset + if pixel_offset: + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. + keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[ + 1]) & (y_ctr < im_shape[0]))[0] + else: + keep = np.where((ws >= min_size) & (hs >= min_size))[0] return keep @@ -144,7 +149,8 @@ class TestGenerateProposalsV2Op(OpTest): 'post_nms_topN': self.post_nms_topN, 'nms_thresh': self.nms_thresh, 'min_size': self.min_size, - 'eta': self.eta + 'eta': self.eta, + 'pixel_offset': self.pixel_offset, } self.outputs = { @@ -165,6 +171,7 @@ class TestGenerateProposalsV2Op(OpTest): self.nms_thresh = 0.7 self.min_size = 3.0 self.eta = 1. + self.pixel_offset = True def init_test_input(self): batch_size = 1 @@ -191,7 +198,7 @@ class TestGenerateProposalsV2Op(OpTest): self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python( self.scores, self.bbox_deltas, self.im_shape, self.anchors, self.variances, self.pre_nms_topN, self.post_nms_topN, - self.nms_thresh, self.min_size, self.eta) + self.nms_thresh, self.min_size, self.eta, self.pixel_offset) class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op): @@ -231,6 +238,17 @@ class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op): self.nms_thresh = 0.7 self.min_size = 1000.0 self.eta = 1. + self.pixel_offset = True + + +class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op): + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 3.0 + self.eta = 1. + self.pixel_offset = False if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index fb8a090b80700d9b884a72f7f430723754523a13..940a3e9f9605bb7cffa63f41bdd6c31428787d9f 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -35,7 +35,8 @@ class TestROIAlignOp(OpTest): 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.aligned, } self.outputs = {'Out': self.out_data} @@ -53,6 +54,7 @@ class TestROIAlignOp(OpTest): self.pooled_height = 2 self.pooled_width = 2 self.sampling_ratio = -1 + self.aligned = False self.x = np.random.random(self.x_dim).astype('float64') @@ -115,16 +117,21 @@ class TestROIAlignOp(OpTest): (self.rois_num, self.channels, self.pooled_height, self.pooled_width)).astype('float64') + offset = 0.5 if self.aligned else 0. 
         for i in range(self.rois_num):
             roi = self.rois[i]
             roi_batch_id = int(roi[0])
             x_i = self.x[roi_batch_id]
-            roi_xmin = roi[1] * self.spatial_scale
-            roi_ymin = roi[2] * self.spatial_scale
-            roi_xmax = roi[3] * self.spatial_scale
-            roi_ymax = roi[4] * self.spatial_scale
-            roi_width = max(roi_xmax - roi_xmin, 1)
-            roi_height = max(roi_ymax - roi_ymin, 1)
+            roi_xmin = roi[1] * self.spatial_scale - offset
+            roi_ymin = roi[2] * self.spatial_scale - offset
+            roi_xmax = roi[3] * self.spatial_scale - offset
+            roi_ymax = roi[4] * self.spatial_scale - offset
+
+            roi_width = roi_xmax - roi_xmin
+            roi_height = roi_ymax - roi_ymin
+            if not self.aligned:
+                roi_width = max(roi_width, 1)
+                roi_height = max(roi_height, 1)
             bin_size_h = float(roi_height) / float(self.pooled_height)
             bin_size_w = float(roi_width) / float(self.pooled_width)
             roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
@@ -192,11 +199,31 @@ class TestROIAlignInLodOp(TestROIAlignOp):
             'spatial_scale': self.spatial_scale,
             'pooled_height': self.pooled_height,
             'pooled_width': self.pooled_width,
-            'sampling_ratio': self.sampling_ratio
+            'sampling_ratio': self.sampling_ratio,
+            'aligned': self.aligned
         }
 
         self.outputs = {'Out': self.out_data}
 
 
+class TestROIAlignOpWithAligned(TestROIAlignOp):
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 8
+        self.width = 6
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 2.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.sampling_ratio = -1
+        self.aligned = True
+
+        self.x = np.random.random(self.x_dim).astype('float64')
+
+
 if __name__ == '__main__':
     unittest.main()
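
Note (not part of the patch): the new pixel_offset attribute toggles the legacy "+1" pixel convention used throughout these kernels when computing box widths, heights, areas and IoU. The NumPy sketch below only mirrors that convention for illustration; the helper names are not Paddle APIs.

import numpy as np

def box_wh(boxes, pixel_offset=True):
    # With pixel_offset=True a box spanning columns x_min..x_max is treated as
    # (x_max - x_min + 1) pixels wide; with pixel_offset=False coordinates are
    # continuous and no +1 is applied.
    offset = 1.0 if pixel_offset else 0.0
    w = boxes[:, 2] - boxes[:, 0] + offset
    h = boxes[:, 3] - boxes[:, 1] + offset
    return w, h

def iou(a, b, pixel_offset=True):
    # Same +offset convention as the IoU used inside the NMS kernels above.
    offset = 1.0 if pixel_offset else 0.0
    xa, ya = max(a[0], b[0]), max(a[1], b[1])
    xb, yb = min(a[2], b[2]), min(a[3], b[3])
    inter = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0)
    area_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset)
    area_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset)
    return inter / (area_a + area_b - inter)

boxes = np.array([[0., 0., 3., 3.], [1., 1., 4., 4.]])
print(box_wh(boxes, pixel_offset=True)[0])           # [4. 4.]
print(box_wh(boxes, pixel_offset=False)[0])          # [3. 3.]
print(iou(boxes[0], boxes[1], pixel_offset=True))    # 9/23  ~ 0.391
print(iou(boxes[0], boxes[1], pixel_offset=False))   # 4/14  ~ 0.286

With pixel_offset=False the same boxes score a noticeably lower IoU, which is why generate_proposals_v2 and distribute_fpn_proposals register a version checkpoint when the attribute is added.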
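
Likewise, a minimal sketch of what the new roi_align aligned attribute changes, assuming the same -0.5 half-pixel shift applied in GPUROIAlignForward and CPUROIAlignOpKernel (the helper below is illustrative only, not the operator's API):

def roi_bounds(roi_xyxy, spatial_scale, aligned):
    # aligned=True shifts the scaled RoI by -0.5 so bin centres line up with
    # feature-map sample points; aligned=False keeps the legacy behaviour and
    # clamps the RoI to at least 1x1.
    offset = 0.5 if aligned else 0.0
    x0, y0, x1, y1 = [v * spatial_scale - offset for v in roi_xyxy]
    w, h = x1 - x0, y1 - y0
    if not aligned:
        w, h = max(w, 1.0), max(h, 1.0)
    return x0, y0, w, h

print(roi_bounds([4., 4., 8., 8.], spatial_scale=0.25, aligned=False))
# (1.0, 1.0, 1.0, 1.0)
print(roi_bounds([4., 4., 8., 8.], spatial_scale=0.25, aligned=True))
# (0.5, 0.5, 1.0, 1.0)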