diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c2b7c27ab4adb5282ad7aa5f7a16c15f81ba5f5e..1915323f3c324f952c183049759945030e5728f5 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -46,10 +46,12 @@ if(WITH_GPU) set(TMPDEPS memory cub) endif() detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS}) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..8840765841d2bce3660f172be9e10c363977e678 --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -0,0 +1,285 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +struct RangeInitFunctor { + int start_; + int delta_; + int *out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +template +static void SortDescending(const platform::CUDADeviceContext &ctx, + const Tensor &value, Tensor *value_out, + Tensor *index_out) { + int num = static_cast(value.numel()); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + platform::ForRange for_range(ctx, num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + // Allocate temporary storage + auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, + idx_out, num); +} + +template +struct BoxDecodeAndClipFunctor { + const T *anchor; + const T *deltas; + const T *var; + const int *index; + const T *im_info; + + T *proposals; + + BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, + const int *index, const T *im_info, T *proposals) + : anchor(anchor), + deltas(deltas), + var(var), + index(index), + im_info(im_info), + proposals(proposals) {} + + T bbox_clip_default{static_cast(kBBoxClipDefault)}; + + __device__ void operator()(size_t i) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx, d_cy, d_w, d_h; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } + + __device__ __forceinline__ T Min(T a, T b) const { return a > b ? 
b : a; } + + __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } +}; + +template +static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, + int *keep_num, int *keep, + bool is_scale = true) { + T im_h = im_info[0]; + T im_w = im_info[1]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + if (is_scale) { + w = (xmax - xmin) / im_info[2] + 1.; + h = (ymax - ymin) / im_info[2] + 1.; + } + + if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +static __device__ float IoU(const float *a, const float *b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + +static __global__ void NMSKernel(const int n_boxes, + const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { + int boxes_num = proposals.dims()[0]; + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + 
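+  // Note on the mask layout produced by NMSKernel above: each candidate box
+  // owns a row of `col_blocks` 64-bit words, and bit j of word b in that row
+  // is set when the box overlaps box (b * kThreadsPerBlock + j) by more than
+  // the NMS threshold. The serial scan below walks boxes in their given order
+  // (the callers pass score-sorted proposals), keeps a box only if no
+  // previously kept box has flagged it, and then ORs its row into `remv` to
+  // suppress the boxes it covers.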
framework::Vector mask(boxes_num * col_blocks); + NMSKernel<<>>(boxes_num, nms_threshold, boxes, + mask.CUDAMutableData(BOOST_GET_CONST( + platform::CUDAPlace, ctx.GetPlace()))); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = &mask[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); + memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), + sizeof(int) * num_to_keep, ctx.stream()); + ctx.Wait(); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 6c9fea1fd4419600fa93cae6315e6bd798de3248..b7a23c48fb8c7bdfd4ce4fd78f42c703ddcae43e 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -21,6 +21,8 @@ limitations under the License. */ namespace paddle { namespace operators { +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + struct RangeInitFunctor { int start; int delta; @@ -125,17 +127,45 @@ void BboxOverlaps(const framework::Tensor& r_boxes, } } +// Calculate max IoU between each box and ground-truth and +// each row represents one box +template +void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { + const T* iou_data = iou.data(); + int row = iou.dims()[0]; + int col = iou.dims()[1]; + T* max_iou_data = max_iou->data(); + for (int i = 0; i < row; ++i) { + const T* v = iou_data + i * col; + T max_v = *std::max_element(v, v + col); + max_iou_data[i] = max_v; + } +} + +static void AppendProposals(framework::Tensor* dst, int64_t offset, + const framework::Tensor& src) { + auto* out_data = dst->data(); + auto* to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + template void ClipTiledBoxes(const platform::DeviceContext& ctx, const framework::Tensor& im_info, const framework::Tensor& input_boxes, - framework::Tensor* out) { + framework::Tensor* out, bool is_scale = true) { T* out_data = out->mutable_data(ctx.GetPlace()); const T* im_info_data = im_info.data(); const T* input_boxes_data = input_boxes.data(); T zero(0); - T im_w = round(im_info_data[1] / im_info_data[2]); - T im_h = round(im_info_data[0] / im_info_data[2]); + T im_w = + is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1]; + T im_h = + is_scale ? 
round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; for (int64_t i = 0; i < input_boxes.numel(); ++i) { if (i % 4 == 0) { out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); @@ -149,19 +179,101 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx, } } -// Calculate max IoU between each box and ground-truth and -// each row represents one box -template -void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { - const T* iou_data = iou.data(); - int row = iou.dims()[0]; - int col = iou.dims()[1]; - T* max_iou_data = max_iou->data(); - for (int i = 0; i < row; ++i) { - const T* v = iou_data + i * col; - T max_v = *std::max_element(v, v + col); - max_iou_data[i] = max_v; +// Filter the box with small area +template +void FilterBoxes(const platform::DeviceContext& ctx, + const framework::Tensor* boxes, float min_size, + const framework::Tensor& im_info, bool is_scale, + framework::Tensor* keep) { + const T* im_info_data = im_info.data(); + const T* boxes_data = boxes->data(); + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int* keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + + if (is_scale) { + ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1; + hs = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + 1; + } + + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static void BoxCoder(const platform::DeviceContext& ctx, + framework::Tensor* all_anchors, + framework::Tensor* bbox_deltas, + framework::Tensor* variances, + framework::Tensor* proposals) { + T* proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto* bbox_deltas_data = bbox_deltas->data(); + auto* anchor_data = all_anchors->data(); + const T* variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = 
std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } + // return proposals; } } // namespace operators diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 06e560f86d4e0a74f7ae04b155829618ce634697..2bf5e6c5e04da539622a072dae35948492a43243 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -27,18 +29,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { - auto *out_data = dst->data(); - auto *to_add_data = src.data(); - size_t size_of_t = framework::SizeOfType(src.type()); - offset *= size_of_t; - std::memcpy( - reinterpret_cast(reinterpret_cast(out_data) + offset), - to_add_data, src.numel() * size_of_t); -} - class GenerateProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -77,225 +67,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { } }; -template -static inline void BoxCoder(const platform::DeviceContext &ctx, - Tensor *all_anchors, Tensor *bbox_deltas, - Tensor *variances, Tensor *proposals) { - T *proposals_data = proposals->mutable_data(ctx.GetPlace()); - - int64_t row = all_anchors->dims()[0]; - int64_t len = all_anchors->dims()[1]; - - auto *bbox_deltas_data = bbox_deltas->data(); - auto *anchor_data = all_anchors->data(); - const T *variances_data = nullptr; - if (variances) { - variances_data = variances->data(); - } - - for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - - T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; - T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; - - T bbox_center_x = 0, bbox_center_y = 0; - T bbox_width = 0, bbox_height = 0; - - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + - anchor_center_x; - bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - bbox_center_x = - bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], 
- kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } - - proposals_data[i * len] = bbox_center_x - bbox_width / 2; - proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; - } - // return proposals; -} - -template -static inline void ClipTiledBoxes(const platform::DeviceContext &ctx, - const Tensor &im_info, Tensor *boxes) { - T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - const T *im_info_data = im_info.data(); - T zero(0); - for (int64_t i = 0; i < boxes->numel(); ++i) { - if (i % 4 == 0) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else if (i % 4 == 1) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } else if (i % 4 == 2) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } - } -} - -template -static inline void FilterBoxes(const platform::DeviceContext &ctx, - Tensor *boxes, float min_size, - const Tensor &im_info, Tensor *keep) { - const T *im_info_data = im_info.data(); - T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0]}); - min_size = std::max(min_size, 1.0f); - int *keep_data = keep->mutable_data(ctx.GetPlace()); - - int keep_len = 0; - for (int i = 0; i < boxes->dims()[0]; ++i) { - T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; - T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; - T ws_origin_scale = - (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; - T hs_origin_scale = - (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; - T x_ctr = boxes_data[4 * i] + ws / 2; - T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && - x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { - keep_data[keep_len++] = i; - } - } - keep->Resize({keep_len}); -} - -template -static inline std::vector> GetSortedScoreIndex( - const std::vector &scores) { - std::vector> sorted_indices; - sorted_indices.reserve(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sorted_indices.emplace_back(scores[i], i); - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices.begin(), sorted_indices.end(), - [](const std::pair &a, const std::pair &b) { - return a.first < b.first; - }); - return sorted_indices; -} - -template -static inline T BBoxArea(const T *box, bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. 
- return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); - const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline Tensor VectorToTensor(const std::vector &selected_indices, - int selected_num) { - Tensor keep_nms; - keep_nms.Resize({selected_num}); - auto *keep_data = keep_nms.mutable_data(platform::CPUPlace()); - for (int i = 0; i < selected_num; ++i) { - keep_data[i] = selected_indices[i]; - } - return keep_nms; -} - -template -static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, - Tensor *scores, T nms_threshold, float eta) { - int64_t num_boxes = bbox->dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox->dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores->data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices = - GetSortedScoreIndex(scores_data); - - std::vector selected_indices; - int selected_num = 0; - T adaptive_threshold = nms_threshold; - const T *bbox_data = bbox->data(); - while (sorted_indices.size() != 0) { - int idx = sorted_indices.back().second; - bool flag = true; - for (int kept_idx : selected_indices) { - if (flag) { - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, false); - flag = (overlap <= adaptive_threshold); - } else { - break; - } - } - if (flag) { - selected_indices.push_back(idx); - ++selected_num; - } - sorted_indices.erase(sorted_indices.end() - 1); - if (flag && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - return VectorToTensor(selected_indices, selected_num); -} - template class GenerateProposalsKernel : public framework::OpKernel { public: @@ -434,10 +205,10 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); - ClipTiledBoxes(ctx, im_info_slice, &proposals); + ClipTiledBoxes(ctx, im_info_slice, proposals, &proposals, false); Tensor keep; - FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { math::SetConstant set_zero; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 71323ea966a6cffe6a1750d96083a68f0a686bfb..8359fbab519b36f58fbeaf02082f02a1372993fc 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -16,13 +16,11 @@ limitations under the License. 
*/ #include #include #include -#include "cub/cub.cuh" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/detection/bbox_util.cu.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { @@ -31,258 +29,6 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; namespace { - -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) - -int const kThreadsPerBlock = sizeof(uint64_t) * 8; - -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -struct RangeInitFunctor { - int start_; - int delta_; - int *out_; - __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } -}; - -template -static void SortDescending(const platform::CUDADeviceContext &ctx, - const Tensor &value, Tensor *value_out, - Tensor *index_out) { - int num = static_cast(value.numel()); - Tensor index_in_t; - int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - platform::ForRange for_range(ctx, num); - for_range(RangeInitFunctor{0, 1, idx_in}); - - int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); - - const T *keys_in = value.data(); - T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); - - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); - // Allocate temporary storage - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - - // Run sorting operation - cub::DeviceRadixSort::SortPairsDescending( - d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); -} - -template -struct BoxDecodeAndClipFunctor { - const T *anchor; - const T *deltas; - const T *var; - const int *index; - const T *im_info; - - T *proposals; - - BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, - const int *index, const T *im_info, T *proposals) - : anchor(anchor), - deltas(deltas), - var(var), - index(index), - im_info(im_info), - proposals(proposals) {} - - T bbox_clip_default{static_cast(kBBoxClipDefault)}; - - __device__ void operator()(size_t i) { - int k = index[i] * 4; - T axmin = anchor[k]; - T aymin = anchor[k + 1]; - T axmax = anchor[k + 2]; - T aymax = anchor[k + 3]; - - T w = axmax - axmin + 1.0; - T h = aymax - aymin + 1.0; - T cx = axmin + 0.5 * w; - T cy = aymin + 0.5 * h; - - T dxmin = deltas[k]; - T dymin = deltas[k + 1]; - T dxmax = deltas[k + 2]; - T dymax = deltas[k + 3]; - - T d_cx, d_cy, d_w, d_h; - if (var) { - d_cx = cx + dxmin * w * var[k]; - d_cy = cy + dymin * h * var[k + 1]; - d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; - d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; - } else { - d_cx = cx + dxmin * w; - d_cy = cy + dymin * h; - d_w = exp(Min(dxmax, bbox_clip_default)) * w; - d_h = exp(Min(dymax, bbox_clip_default)) * h; - } - - T oxmin = d_cx - d_w * 0.5; - T oymin = d_cy - d_h * 0.5; - T oxmax = d_cx + d_w * 0.5 - 1.; - T oymax = d_cy + d_h * 0.5 - 1.; - - proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); - proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); - proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); - proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); - 
} - - __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } - - __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } -}; - -template -static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, - const T min_size, const int num, - int *keep_num, int *keep) { - T im_h = im_info[0]; - T im_w = im_info[1]; - T im_scale = im_info[2]; - - int cnt = 0; - __shared__ int keep_index[BlockSize]; - - CUDA_KERNEL_LOOP(i, num) { - keep_index[threadIdx.x] = -1; - __syncthreads(); - - int k = i * 4; - T xmin = bboxes[k]; - T ymin = bboxes[k + 1]; - T xmax = bboxes[k + 2]; - T ymax = bboxes[k + 3]; - - T w = xmax - xmin + 1.0; - T h = ymax - ymin + 1.0; - T cx = xmin + w / 2.; - T cy = ymin + h / 2.; - - T w_s = (xmax - xmin) / im_scale + 1.; - T h_s = (ymax - ymin) / im_scale + 1.; - - if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { - keep_index[threadIdx.x] = i; - } - __syncthreads(); - if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; - for (int j = 0; j < size; ++j) { - if (keep_index[j] > -1) { - keep[cnt++] = keep_index[j]; - } - } - } - __syncthreads(); - } - if (threadIdx.x == 0) { - keep_num[0] = cnt; - } -} - -static __device__ inline float IoU(const float *a, const float *b) { - float left = max(a[0], b[0]), right = min(a[2], b[2]); - float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); - float inter_s = width * height; - float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); - float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); - return inter_s / (s_a + s_b - inter_s); -} - -static __global__ void NMSKernel(const int n_boxes, - const float nms_overlap_thresh, - const float *dev_boxes, uint64_t *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - const int row_size = - min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); - const int col_size = - min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); - - __shared__ float block_boxes[kThreadsPerBlock * 4]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 4 + 0] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; - block_boxes[threadIdx.x * 4 + 1] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; - block_boxes[threadIdx.x * 4 + 2] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; - block_boxes[threadIdx.x * 4 + 3] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; - const float *cur_box = dev_boxes + cur_box_idx * 4; - int i = 0; - uint64_t t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { - t |= 1ULL << i; - } - } - const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } -} - -template -static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, - const Tensor &sorted_indices, const T nms_threshold, - Tensor *keep_out) { - int boxes_num = proposals.dims()[0]; - const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); - dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), - DIVUP(boxes_num, kThreadsPerBlock)); - dim3 threads(kThreadsPerBlock); - - const T *boxes = proposals.data(); - auto 
place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace()))); - - std::vector remv(col_blocks); - memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); - - std::vector keep_vec; - int num_to_keep = 0; - for (int i = 0; i < boxes_num; i++) { - int nblock = i / kThreadsPerBlock; - int inblock = i % kThreadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - ++num_to_keep; - keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); - memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), - sizeof(int) * num_to_keep, ctx.stream()); - ctx.Wait(); -} - template static std::pair ProposalForOneImage( const platform::CUDADeviceContext &ctx, const Tensor &im_info, diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c2fd599fa6a2eee081361d0615efaa387563c50 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -0,0 +1,314 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/nms_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class GenerateProposalsV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("Scores"), true, + platform::errors::NotFound("Input(Scores) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("BboxDeltas"), true, + platform::errors::NotFound("Input(BboxDeltas) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("ImShape"), true, + platform::errors::NotFound("Input(ImShape) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Anchors"), true, + platform::errors::NotFound("Input(Anchors) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Variances"), true, + platform::errors::NotFound("Input(Variances) shouldn't be null.")); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + if (!ctx->IsRuntime()) { + ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1)); + ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1)); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Anchors"), + ctx.device_context()); + } +}; + +template +class GenerateProposalsV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_shape = context.Input("ImShape"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", + "Anchors", "GenerateProposals"); + auto variances = GET_DATA_SAFELY(context.Input("Variances"), + "Input", "Variances", "GenerateProposals"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = + context.template device_context(); + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + 
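+    // Scores are transposed to NHWC as well, so that after the reshapes below
+    // row k of scores_slice and row k of bbox_deltas_slice describe the same
+    // anchor, enumerated in (H, W, A) order to match the flattened anchors.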
trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + std::vector tmp_num; + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_shape_slice = im_shape->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + for (int i = 0; i < num; i++) { + num_data[i] = tmp_num[i]; + } + rpn_rois_num->Resize({num}); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const platform::CPUDeviceContext &ctx, const Tensor &im_shape_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_shape_slice, proposals, &proposals, false); + + Tensor keep; + FilterBoxes(ctx, &proposals, min_size, im_shape_slice, false, &keep); + // Handle the case when there is no keep index left + if (keep.numel() == 0) { + math::SetConstant set_zero; + bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); + set_zero(ctx, &bbox_sel, static_cast(0)); + 
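+      // No box survived the size filter: emit a single all-zero proposal with
+      // zero score so this image still contributes one row to the LoD output.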
+      Tensor scores_filter;
+      scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+      set_zero(ctx, &scores_filter, static_cast<T>(0));
+      return std::make_pair(bbox_sel, scores_filter);
+    }
+
+    Tensor scores_filter;
+    bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
+    scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+    if (nms_thresh <= 0) {
+      return std::make_pair(bbox_sel, scores_filter);
+    }
+
+    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
+
+    if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+      keep_nms.Resize({post_nms_top_n});
+    }
+
+    proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+    scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+
+    return std::make_pair(proposals, scores_sel);
+  }
+};
+
+class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Scores",
+             "(Tensor) The scores from conv are in shape (N, A, H, W), "
+             "N is batch size, A is number of anchors, "
+             "H and W are height and width of the feature map.");
+    AddInput("BboxDeltas",
+             "(Tensor) Bounding box deltas from conv are in "
+             "shape (N, 4*A, H, W).");
+    AddInput("ImShape",
+             "(Tensor) Image shape with shape (N, 2), "
+             "in format (height, width).");
+    AddInput("Anchors",
+             "(Tensor) Bounding box anchors from anchor_generator_op "
+             "are in shape (A, H, W, 4).");
+    AddInput("Variances",
+             "(Tensor) Bounding box variances with same shape as `Anchors`.");
+
+    AddOutput("RpnRois",
+              "(LoDTensor) Output proposals with shape (rois_num, 4).");
+    AddOutput("RpnRoiProbs",
+              "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
+    AddOutput("RpnRoisNum", "(Tensor) The number of RPN RoIs in each image.")
+        .AsDispensable();
+    AddAttr<int>("pre_nms_topN",
+                 "Number of top scoring RPN proposals to keep before "
+                 "applying NMS.");
+    AddAttr<int>("post_nms_topN",
+                 "Number of top scoring RPN proposals to keep after "
+                 "applying NMS.");
+    AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
+    AddAttr<float>("min_size",
+                   "Proposal height and width both need to be greater "
+                   "than this min_size.");
+    AddAttr<float>("eta", "The parameter for adaptive NMS.");
+    AddComment(R"DOC(
+This operator is the second version of the generate_proposals op. It generates
+bounding box proposals for Faster R-CNN.
+The proposals are generated for a list of images based on the image
+scores 'Scores', the bounding box regression results 'BboxDeltas', and
+predefined bounding box shapes 'Anchors'. Greedy
+non-maximum suppression is applied to produce the final bounding
+boxes.
+
+The difference between this version and the first version is that the image
+scale is no longer needed, so the input is im_shape instead of im_info.
+The change aims to unify the inputs of all object detection models,
+such as YOLO-v3 and Faster R-CNN. As a result, min_size now refers to the
+size on the input image rather than on the original image, which is slightly
+different from before and will not affect the result.
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + generate_proposals_v2, ops::GenerateProposalsV2Op, + ops::GenerateProposalsV2OpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposals_v2, + ops::GenerateProposalsV2Kernel, + ops::GenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..70020cdc64ef58a47624bd0ce27898e86623dfa3 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -0,0 +1,229 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { +template +static std::pair ProposalForOneImage( + const platform::CUDADeviceContext &ctx, const Tensor &im_shape, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + // 1. pre nms + Tensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); + + { + platform::ForRange for_range(ctx, pre_nms_num); + for_range(BoxDecodeAndClipFunctor{ + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_shape.data(), proposals.data()}); + } + + // 3. 
filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); + keep_num_t.mutable_data({1}, ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + auto stream = ctx.stream(); + FilterBBoxes<<<1, 512, 0, stream>>>( + proposals.data(), im_shape.data(), min_size, pre_nms_num, + keep_num_t.data(), keep_index.data(), false); + int keep_num; + const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, + keep_num_t.data(), sizeof(int), ctx.stream()); + ctx.Wait(); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + math::SetConstant set_zero; + proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &proposals_filter, static_cast(0)); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } + proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); + GPUGather(ctx, proposals, keep_index, &proposals_filter); + GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_shape = context.Input("ImShape"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", + "Anchors", "GenerateProposals"); + auto variances = GET_DATA_SAFELY(context.Input("Variances"), + "Input", "Variances", "GenerateProposals"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., + platform::errors::InvalidArgument( + "Not support adaptive NMS. The attribute 'eta' " + "should not less than 1. 
But received eta=[%d]", + eta)); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto cpu_place = platform::CPUPlace(); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + std::vector tmp_num; + + for (int64_t i = 0; i < num; ++i) { + Tensor im_shape_slice = im_shape->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor &proposals = box_score_pair.first; + Tensor &scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), + dev_ctx.stream()); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), + dev_ctx.stream()); + dev_ctx.Wait(); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num, + dev_ctx.stream()); + rpn_rois_num->Resize({num}); + } + framework::LoD lod; + lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(generate_proposals_v2, + ops::CUDAGenerateProposalsV2Kernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h index 067bfce51949c7526ebe87bb51722327691db555..febdee8263553064ca1f59124fdf29df62aee1d8 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/fluid/operators/detection/nms_util.h @@ -99,5 +99,74 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, } } +template +static inline std::vector> GetSortedScoreIndex( + const std::vector& scores) 
{ + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair& a, const std::pair& b) { + return a.first < b.first; + }); + return sorted_indices; +} + +template +static inline framework::Tensor VectorToTensor( + const std::vector& selected_indices, int selected_num) { + framework::Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto* keep_data = keep_nms.mutable_data(platform::CPUPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +framework::Tensor NMS(const platform::DeviceContext& ctx, + framework::Tensor* bbox, framework::Tensor* scores, + T nms_threshold, float eta) { + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 92006bff2cc16cb917692818f9bd74c85d8bb476..10914cf0ab7ba2292e59847bafdff1ce23a730e1 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -81,6 +81,7 @@ std::map> op_outs_map = { {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, + {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py new file mode 100644 index 0000000000000000000000000000000000000000..26c443008db5049a3e081243c1cb5b509e5d27e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py @@ -0,0 +1,238 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+import paddle
+import paddle.fluid as fluid
+from op_test import OpTest
+from test_multiclass_nms_op import nms
+from test_anchor_generator_op import anchor_generator_in_python
+import copy
+from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms
+
+
+def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors,
+                                    variances, pre_nms_topN, post_nms_topN,
+                                    nms_thresh, min_size, eta):
+    all_anchors = anchors.reshape(-1, 4)
+    rois = np.empty((0, 5), dtype=np.float32)
+    roi_probs = np.empty((0, 1), dtype=np.float32)
+
+    rpn_rois = []
+    rpn_roi_probs = []
+    rois_num = []
+    num_images = scores.shape[0]
+    for img_idx in range(num_images):
+        img_i_boxes, img_i_probs = proposal_for_one_image(
+            im_shape[img_idx, :], all_anchors, variances,
+            bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
+            pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
+        rois_num.append(img_i_probs.shape[0])
+        rpn_rois.append(img_i_boxes)
+        rpn_roi_probs.append(img_i_probs)
+
+    return rpn_rois, rpn_roi_probs, rois_num
+
+
+def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas,
+                           scores, pre_nms_topN, post_nms_topN, nms_thresh,
+                           min_size, eta):
+    # Transpose and reshape predicted bbox transformations to get them
+    # into the same order as the anchors:
+    #   - bbox deltas will be (4 * A, H, W) format from conv output
+    #   - transpose to (H, W, 4 * A)
+    #   - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
+    #     in slowest to fastest order to match the enumerated anchors
+    bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4)
+    all_anchors = all_anchors.reshape(-1, 4)
+    variances = variances.reshape(-1, 4)
+    # Same story for the scores:
+    #   - scores are (A, H, W) format from conv output
+    #   - transpose to (H, W, A)
+    #   - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
+    #     to match the order of anchors and bbox_deltas
+    scores = scores.transpose((1, 2, 0)).reshape(-1, 1)
+
+    # sort all (proposal, score) pairs by score from highest to lowest
+    # take top pre_nms_topN (e.g. 6000)
+    if pre_nms_topN <= 0 or pre_nms_topN >= len(scores):
+        order = np.argsort(-scores.squeeze())
+    else:
+        # Avoid sorting possibly large arrays;
+        # First partition to get top K unsorted
+        # and then sort just those
+        inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN]
+        order = np.argsort(-scores[inds].squeeze())
+        order = inds[order]
+    scores = scores[order, :]
+    bbox_deltas = bbox_deltas[order, :]
+    all_anchors = all_anchors[order, :]
+    proposals = box_coder(all_anchors, bbox_deltas, variances)
+    # clip proposals to image (may result in proposals with zero area
+    # that will be removed in the next step)
+    proposals = clip_tiled_boxes(proposals, im_shape)
+    # remove predicted boxes with height or width < min_size
+    keep = filter_boxes(proposals, min_size, im_shape)
+    if len(keep) == 0:
+        proposals = np.zeros((1, 4)).astype('float32')
+        scores = np.zeros((1, 1)).astype('float32')
+        return proposals, scores
+    proposals = proposals[keep, :]
+    scores = scores[keep, :]
+
+    # apply loose nms (e.g. threshold = 0.7)
+    # take post_nms_topN (e.g. 1000)
+    # return the top proposals
+    if nms_thresh > 0:
+        keep = nms(boxes=proposals,
+                   scores=scores,
+                   nms_threshold=nms_thresh,
+                   eta=eta)
+        if post_nms_topN > 0 and post_nms_topN < len(keep):
+            keep = keep[:post_nms_topN]
+        proposals = proposals[keep, :]
+        scores = scores[keep, :]
+
+    return proposals, scores
+
+
+def filter_boxes(boxes, min_size, im_shape):
+    """Only keep boxes with both sides >= min_size and center within the image.
+    """
+    # Scale min_size to match image scale
+    min_size = max(min_size, 1.0)
+    ws = boxes[:, 2] - boxes[:, 0] + 1
+    hs = boxes[:, 3] - boxes[:, 1] + 1
+    x_ctr = boxes[:, 0] + ws / 2.
+    y_ctr = boxes[:, 1] + hs / 2.
+    keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[1])
+                    & (y_ctr < im_shape[0]))[0]
+    return keep
+
+
+class TestGenerateProposalsV2Op(OpTest):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {
+            'Scores': self.scores,
+            'BboxDeltas': self.bbox_deltas,
+            'ImShape': self.im_shape.astype(np.float32),
+            'Anchors': self.anchors,
+            'Variances': self.variances
+        }
+
+        self.attrs = {
+            'pre_nms_topN': self.pre_nms_topN,
+            'post_nms_topN': self.post_nms_topN,
+            'nms_thresh': self.nms_thresh,
+            'min_size': self.min_size,
+            'eta': self.eta
+        }
+
+        self.outputs = {
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "generate_proposals_v2"
+        self.set_data()
+
+    def init_test_params(self):
+        self.pre_nms_topN = 12000  # train 12000, test 2000
+        self.post_nms_topN = 5000  # train 6000, test 1000
+        self.nms_thresh = 0.7
+        self.min_size = 3.0
+        self.eta = 1.
+
+    def init_test_input(self):
+        batch_size = 1
+        input_channels = 20
+        layer_h = 16
+        layer_w = 16
+        input_feat = np.random.random(
+            (batch_size, input_channels, layer_h, layer_w)).astype('float32')
+        self.anchors, self.variances = anchor_generator_in_python(
+            input_feat=input_feat,
+            anchor_sizes=[16., 32.],
+            aspect_ratios=[0.5, 1.0],
+            variances=[1.0, 1.0, 1.0, 1.0],
+            stride=[16.0, 16.0],
+            offset=0.5)
+        self.im_shape = np.array([[64, 64]]).astype('float32')
+        num_anchors = self.anchors.shape[2]
+        self.scores = np.random.random(
+            (batch_size, num_anchors, layer_h, layer_w)).astype('float32')
+        self.bbox_deltas = np.random.random(
+            (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')
+
+    def init_test_output(self):
+        self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python(
+            self.scores, self.bbox_deltas, self.im_shape, self.anchors,
+            self.variances, self.pre_nms_topN, self.post_nms_topN,
+            self.nms_thresh, self.min_size, self.eta)
+
+
+class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {
+            'Scores': self.scores,
+            'BboxDeltas': self.bbox_deltas,
+            'ImShape': self.im_shape.astype(np.float32),
+            'Anchors': self.anchors,
+            'Variances': self.variances
+        }
+
+        self.attrs = {
+            'pre_nms_topN': self.pre_nms_topN,
+            'post_nms_topN': self.post_nms_topN,
+            'nms_thresh': self.nms_thresh,
+            'min_size': self.min_size,
+            'eta': self.eta,
+            'return_rois_num': True
+        }
+
+        self.outputs = {
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+            'RpnRoisNum': (np.asarray(
+                self.rois_num, dtype=np.int32))
+        }
+
+
+class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op):
+    def init_test_params(self):
+        self.pre_nms_topN = 12000  # train 12000, test 2000
+        self.post_nms_topN = 5000  # train 6000, test 1000
+        self.nms_thresh = 0.7
+        self.min_size = 1000.0
+        self.eta = 1.
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index defa4f13495d28e3044422015b6737029a9374bb..be1166371944123c92b5205768b18c330dd1a005 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -673,4 +673,5 @@ STATIC_MODE_TESTING_LIST = [
     'test_sgd_op_xpu',
     'test_shape_op_xpu',
     'test_slice_op_xpu',
+    'test_generate_proposals_v2_op',
 ]
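A note on the output layout of generate_proposals_v2: the CUDA kernel concatenates every image's kept proposals into one flat RpnRois tensor of shape (num_proposals, 4), records the per-image boundaries as level-0 LoD offsets, and, when RpnRoisNum is requested, also emits the raw per-image counts. The bookkeeping can be sketched in NumPy as follows; the per-image arrays are hypothetical stand-ins for what ProposalForOneImage / proposal_for_one_image return, not a call into the operator itself.

import numpy as np

# Hypothetical per-image proposals (n_i, 4), e.g. as produced by the Python
# reference generate_proposals_v2_in_python above.
rois_per_image = [np.random.rand(3, 4).astype('float32'),
                  np.random.rand(5, 4).astype('float32')]

# Per-image counts -> the RpnRoisNum output.
rois_num = np.array([r.shape[0] for r in rois_per_image], dtype=np.int32)

# Flattened proposals -> the RpnRois output, shape (sum(rois_num), 4).
rpn_rois = np.concatenate(rois_per_image, axis=0)

# Level-0 LoD offsets, mirroring the `offset` vector built in the kernel's
# image loop: a leading 0 followed by the running total of kept proposals.
lod = np.concatenate([[0], np.cumsum(rois_num)])  # -> [0, 3, 8]

# Image i's proposals are the slice between consecutive offsets.
i = 1
rois_i = rpn_rois[lod[i]:lod[i + 1]]
assert rois_i.shape[0] == rois_num[i]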
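The NMS helper added to nms_util.h sorts the score/index pairs in ascending order, then repeatedly takes the highest-scoring remaining box from the back and drops any later candidate whose overlap with an already-kept box exceeds the threshold; with eta < 1 the threshold shrinks after each kept box (adaptive NMS). Below is a minimal NumPy sketch of that loop, using a plain IoU helper rather than the operator's JaccardOverlap, so the pixel-offset convention may differ slightly.

import numpy as np

def iou(a, b):
    # Plain intersection-over-union of two [xmin, ymin, xmax, ymax] boxes.
    iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = iw * ih
    union = ((a[2] - a[0]) * (a[3] - a[1]) +
             (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / (union + 1e-10)

def greedy_nms(boxes, scores, nms_threshold, eta=1.0):
    # Ascending sort: the candidate with the highest score sits at the back.
    order = list(np.argsort(scores))
    keep = []
    adaptive_threshold = nms_threshold
    while order:
        idx = order.pop()  # highest remaining score
        if all(iou(boxes[idx], boxes[k]) <= adaptive_threshold for k in keep):
            keep.append(idx)
            # Adaptive NMS: shrink the threshold after each kept box.
            if eta < 1 and adaptive_threshold > 0.5:
                adaptive_threshold *= eta
    return keep

Because the kept indices come out in descending score order, truncating to post_nms_topN after NMS, as the test does, keeps the highest-scoring survivors.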
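The pre-NMS top-k selection in proposal_for_one_image avoids fully sorting all H * W * A scores: it first partitions out the top pre_nms_topN indices and then sorts only those. The standalone check below is illustrative only (`k` stands in for pre_nms_topN); it shows the partition-then-sort route yields the same ordering as a full descending sort when scores are distinct.

import numpy as np

scores = np.random.rand(10000).astype('float32')
k = 100  # stands in for pre_nms_topN

# Baseline: full descending sort, then truncate.
full_order = np.argsort(-scores)[:k]

# Cheaper: partition to an unsorted top-k, then sort just those k entries.
inds = np.argpartition(-scores, k)[:k]
partial_order = inds[np.argsort(-scores[inds])]

# Same indices in the same order (ties aside), in roughly O(n + k log k)
# rather than O(n log n).
assert np.array_equal(full_order, partial_order)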