From ca535d18ab808785cc969c8c8f96413536cd7926 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Fri, 8 Dec 2017 12:43:39 +0800 Subject: [PATCH] add detection_output code only --- paddle/operators/detection_output_op.cc | 91 +++++++ paddle/operators/detection_output_op.cu.cc | 21 ++ paddle/operators/detection_output_op.h | 114 ++++++++ paddle/operators/math/detection_util.h | 292 +++++++++++++++++++++ 4 files changed, 518 insertions(+) create mode 100644 paddle/operators/detection_output_op.cc create mode 100644 paddle/operators/detection_output_op.cu.cc create mode 100644 paddle/operators/detection_output_op.h create mode 100644 paddle/operators/math/detection_util.h diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc new file mode 100644 index 00000000000..c018795fd4b --- /dev/null +++ b/paddle/operators/detection_output_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detection_output_op.h" +namespace paddle { +namespace operators { + +class Detection_output_OpMaker : public framework::OpProtoAndCheckerMaker { + public: + Detection_output_OpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Loc", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput( + "Conf", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput( + "PriorBox", + "(Tensor) The input tensor of detection_output operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of detection_output operator." + "N * M." + "M = C * H * W"); + AddAttr("background_label_id", "(int), multi level pooling"); + AddAttr("num_classes", "(int), multi level pooling"); + AddAttr("nms_threshold", "(int), multi level pooling"); + AddAttr("confidence_threshold", "(int), multi level pooling"); + AddAttr("top_k", "(int), multi level pooling"); + AddAttr("nms_top_k", "(int), multi level pooling"); + AddComment(R"DOC( + "Does spatial pyramid pooling on the input image by taking the max, + etc. within regions so that the result vector of different sized + images are of the same size + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(H_{out}, W_{out})$ + Where + $$ + H_{out} = N \\ + W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in} + $$ + )DOC"); + } +}; + +class Detection_output_Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Detection_output_Op" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Detection_output_Op should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + int pyramid_height = ctx->Attrs().Get("pyramid_height"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Detection_output_ing intput must be of 4-dimensional."); + int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1]; + std::vector output_shape({in_x_dims[0], outlen}); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::Detection_output_Op, + ops::Detection_output_OpMaker); +REGISTER_OP_CPU_KERNEL( + detection_output, + ops::Detection_output_Kernel, + ops::Detection_output_Kernel); diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc new file mode 100644 index 00000000000..8edcfc0be35 --- /dev/null +++ b/paddle/operators/detection_output_op.cu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/detection_output_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL( + detection_output, + ops::Detection_output_Kernel, + ops::Detection_output_Kernel); diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h new file mode 100644 index 00000000000..184b864974d --- /dev/null +++ b/paddle/operators/detection_output_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/math/detection_util.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/softmax.h" + +namespace paddle { +namespace operators { +template +class Detection_output_Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_loc = context.Input("Loc"); + const framework::Tensor* in_conf = context.Input("Conf"); + const framework::Tensor* in_priorbox = + context.Input("PriorBox"); + auto* out = context.Output("Out"); + int num_classes = context.template Attr("num_classes"); + int top_k = context.template Attr("top_k"); + int nms_top_k = context.template Attr("nms_top_k"); + int background_label_id = context.template Attr("background_label_id"); + float nms_threshold = context.template Attr("nms_threshold"); + float confidence_threshold = + context.template Attr("confidence_threshold"); + + int input_num = in_loc->dims()[0]; + int batch_size = in_loc->dims()[1]; + int loc_sum_size = in_loc->numel(); + int conf_sum_size = in_conf->numel(); + std::vector loc_shape_vec({1, loc_sum_size}); + std::vector conf_shape_vec( + {conf_sum_size / num_classes, num_classes}); + framework::DDim loc_shape(framework::make_ddim(loc_shape_vec)); + framework::DDim conf_shape(framework::make_ddim(conf_shape_vec)); + framework::Tensor loc_tensor; + framework::Tensor conf_tensor; + loc_tensor.mutable_data(loc_shape, context.GetPlace()); + conf_tensor.mutable_data(conf_shape, context.GetPlace()); + + // KNCHW ==> NHWC + for (int i = 0; i < input_num; ++i) { + math::appendWithPermute(*in_loc, &loc_tensor); + math::appendWithPermute(*in_conf, &conf_tensor); + } + // softmax + math::SoftmaxFunctor()(context.device_context(), &conf_tensor, + &conf_tensor); + // get decode bboxes + size_t num_priors = in_priorbox->numel() / 8; + std::vector>> all_decoded_bboxes; + for (size_t n = 0; n < batch_size; ++n) { + std::vector> decoded_bboxes; + for (size_t i = 0; i < num_priors; ++i) { + size_t prior_offset = i * 8; + size_t loc_pred_offset = n * num_priors * 4 + i * 4; + std::vector> prior_bbox_vec; + math::getBBoxFromPriorData(in_priorbox->data() + prior_offset, 1, + prior_bbox_vec); + std::vector> prior_bbox_var; + math::getBBoxVarFromPriorData(in_priorbox->data() + prior_offset, + 1, prior_bbox_var); + std::vector loc_pred_data; + for (size_t j = 0; j < 4; ++j) + loc_pred_data.push_back( + *(loc_tensor.data() + loc_pred_offset + j)); + math::BBox bbox = math::decodeBBoxWithVar( + prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data); + decoded_bboxes.push_back(bbox); + } + all_decoded_bboxes.push_back(decoded_bboxes); + } + + std::vector>> all_indices; + int num_kept = math::getDetectionIndices( + conf_tensor.data(), num_priors, num_classes, background_label_id, + batch_size, confidence_threshold, nms_top_k, nms_threshold, top_k, + all_decoded_bboxes, &all_indices); + + framework::Tensor out_tmp; + if (num_kept <= 0) { + std::vector out_shape_vec({0, 0}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out->Resize(out_shape); + return; + } + std::vector out_shape_vec({num_kept, 7}); + framework::DDim out_shape(framework::make_ddim(out_shape_vec)); + out_tmp.mutable_data(out_shape, context.GetPlace()); + + T* out_data = out_tmp.data(); + math::getDetectionOutput(conf_tensor.data(), num_kept, num_priors, + num_classes, batch_size, all_indices, + all_decoded_bboxes, out_data); + out->mutable_data(out_shape, context.GetPlace()); + out->ShareDataWith(out_tmp); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h new file mode 100644 index 00000000000..265fa077010 --- /dev/null +++ b/paddle/operators/math/detection_util.h @@ -0,0 +1,292 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "paddle/framework/selected_rows.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +struct BBox { + BBox(T x_min, T y_min, T x_max, T y_max) + : x_min(x_min), + y_min(y_min), + x_max(x_max), + y_max(y_max), + is_difficult(false) {} + + BBox() {} + + T get_width() const { return x_max - x_min; } + + T get_height() const { return y_max - y_min; } + + T get_center_x() const { return (x_min + x_max) / 2; } + + T get_center_y() const { return (y_min + y_max) / 2; } + + T get_area() const { return get_width() * get_height(); } + + // coordinate of bounding box + T x_min; + T y_min; + T x_max; + T y_max; + // whether difficult object (e.g. object with heavy occlusion is difficult) + bool is_difficult; +}; +// KNCHW ==> NHWC +template +int appendWithPermute(const framework::Tensor& input, + framework::Tensor* output) { + const int input_nums = input.dims()[0]; + const int batch_size = input.dims()[1]; + const int channels = input.dims()[2]; + const int height = input.dims()[3]; + const int weight = input.dims()[4]; + int image_size = height * weight; + int offset = 0; + for (int p = 0; p < input_nums; ++p) { + int in_p_offset = p * batch_size * channels * image_size; + for (int n = 0; n < batch_size; ++n) { + int in_n_offset = n * channels * image_size; + int out_n_offset = n * input.numel() / batch_size + offset; + int in_stride = image_size; + int out_stride = channels; + const T* in_data = input.data() + in_p_offset + in_n_offset; + T* out_data = output->data() + out_n_offset; + for (int i = 0; i < channels; ++i) { + for (int c = 0; c < image_size; ++c) { + out_data[out_stride * c + i] = in_data[i * in_stride + c]; + } + } + } + offset += image_size * channels; + } + return 0; +} +template +void getBBoxFromPriorData(const T* prior_data, const size_t num_bboxes, + std::vector>& bbox_vec) { + size_t out_offset = bbox_vec.size(); + bbox_vec.resize(bbox_vec.size() + num_bboxes); + for (size_t i = 0; i < num_bboxes; ++i) { + BBox bbox; + bbox.x_min = *(prior_data + i * 8); + bbox.y_min = *(prior_data + i * 8 + 1); + bbox.x_max = *(prior_data + i * 8 + 2); + bbox.y_max = *(prior_data + i * 8 + 3); + bbox_vec[out_offset + i] = bbox; + } +} +template +void getBBoxVarFromPriorData(const T* prior_data, const size_t num, + std::vector>& var_vec) { + size_t out_offset = var_vec.size(); + var_vec.resize(var_vec.size() + num); + for (size_t i = 0; i < num; ++i) { + std::vector var; + var.push_back(*(prior_data + i * 8 + 4)); + var.push_back(*(prior_data + i * 8 + 5)); + var.push_back(*(prior_data + i * 8 + 6)); + var.push_back(*(prior_data + i * 8 + 7)); + var_vec[out_offset + i] = var; + } +} +template +BBox decodeBBoxWithVar(BBox& prior_bbox, + const std::vector& prior_bbox_var, + const std::vector& loc_pred_data) { + T prior_bbox_width = prior_bbox.get_width(); + T prior_bbox_height = prior_bbox.get_height(); + T prior_bbox_center_x = prior_bbox.get_center_x(); + T prior_bbox_center_y = prior_bbox.get_center_y(); + + T decoded_bbox_center_x = + prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width + + prior_bbox_center_x; + T decoded_bbox_center_y = + prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height + + prior_bbox_center_y; + T decoded_bbox_width = + std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width; + T decoded_bbox_height = + std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height; + + BBox decoded_bbox; + decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2; + decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2; + decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2; + decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2; + + return decoded_bbox; +} +template +bool sortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} +template +bool sortScorePairDescend(const std::pair>& pair1, + const std::pair>& pair2); +template +T jaccardOverlap(const BBox& bbox1, const BBox& bbox2) { + if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min || + bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) { + return 0.0; + } else { + T inter_x_min = std::max(bbox1.x_min, bbox2.x_min); + T inter_y_min = std::max(bbox1.y_min, bbox2.y_min); + T interX_max = std::min(bbox1.x_max, bbox2.x_max); + T interY_max = std::min(bbox1.y_max, bbox2.y_max); + + T inter_width = interX_max - inter_x_min; + T inter_height = interY_max - inter_y_min; + T inter_area = inter_width * inter_height; + + T bbox_area1 = bbox1.get_area(); + T bbox_area2 = bbox2.get_area(); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } +} + +template +void applyNMSFast(const std::vector>& bboxes, const T* conf_score_data, + size_t class_idx, size_t top_k, T conf_threshold, + T nms_threshold, size_t num_priors, size_t num_classes, + std::vector* indices) { + std::vector> scores; + for (size_t i = 0; i < num_priors; ++i) { + size_t conf_offset = i * num_classes + class_idx; + if (conf_score_data[conf_offset] > conf_threshold) + scores.push_back(std::make_pair(conf_score_data[conf_offset], i)); + } + std::stable_sort(scores.begin(), scores.end(), + sortScorePairDescend); + if (top_k > 0 && top_k < scores.size()) scores.resize(top_k); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t saved_idx = (*indices)[i]; + T overlap = jaccardOverlap(bboxes[idx], bboxes[saved_idx]); + keep = overlap <= nms_threshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} +template +int getDetectionIndices( + const T* conf_data, const size_t num_priors, const size_t num_classes, + const size_t background_label_id, const size_t batch_size, + const T conf_threshold, const size_t nms_top_k, const T nms_threshold, + const size_t top_k, + const std::vector>>& all_decoded_bboxes, + std::vector>>* all_detection_indices) { + int total_keep_num = 0; + for (size_t n = 0; n < batch_size; ++n) { + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + size_t num_detected = 0; + std::map> indices; + size_t conf_offset = n * num_priors * num_classes; + for (size_t c = 0; c < num_classes; ++c) { + if (c == background_label_id) continue; + applyNMSFast(decoded_bboxes, conf_data + conf_offset, c, nms_top_k, + conf_threshold, nms_threshold, num_priors, num_classes, + &(indices[c])); + num_detected += indices[c].size(); + } + if (top_k > 0 && num_detected > top_k) { + // std::vector> score_index_pairs; + std::vector>> score_index_pairs; + for (size_t c = 0; c < num_classes; ++c) { + const std::vector& label_indices = indices[c]; + for (size_t i = 0; i < label_indices.size(); ++i) { + size_t idx = label_indices[i]; + score_index_pairs.push_back( + std::make_pair((conf_data + conf_offset)[idx * num_classes + c], + std::make_pair(c, idx))); + } + } + std::sort(score_index_pairs.begin(), score_index_pairs.end(), + sortScorePairDescend>); + score_index_pairs.resize(top_k); + std::map> new_indices; + for (size_t i = 0; i < score_index_pairs.size(); ++i) { + size_t label = score_index_pairs[i].second.first; + size_t idx = score_index_pairs[i].second.second; + new_indices[label].push_back(idx); + } + all_detection_indices->push_back(new_indices); + total_keep_num += top_k; + } else { + all_detection_indices->push_back(indices); + total_keep_num += num_detected; + } + } + return total_keep_num; +} +template +BBox clipBBox(const BBox& bbox) { + T one = static_cast(1.0); + T zero = static_cast(0.0); + BBox clipped_bbox; + clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero); + clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero); + clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero); + clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero); + return clipped_bbox; +} +template +void getDetectionOutput( + const T* conf_data, const size_t num_kept, const size_t num_priors, + const size_t num_classes, const size_t batch_size, + const std::vector>>& all_indices, + const std::vector>>& all_decoded_bboxes, T* out_data) { + size_t count = 0; + for (size_t n = 0; n < batch_size; ++n) { + for (std::map>::const_iterator it = + all_indices[n].begin(); + it != all_indices[n].end(); ++it) { + size_t label = it->first; + const std::vector& indices = it->second; + const std::vector>& decoded_bboxes = all_decoded_bboxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t conf_offset = n * num_priors * num_classes + idx * num_classes; + out_data[count * 7] = n; + out_data[count * 7 + 1] = label; + out_data[count * 7 + 2] = (conf_data + conf_offset)[label]; + BBox clipped_bbox = clipBBox(decoded_bboxes[idx]); + out_data[count * 7 + 3] = clipped_bbox.x_min; + out_data[count * 7 + 4] = clipped_bbox.y_min; + out_data[count * 7 + 5] = clipped_bbox.x_max; + out_data[count * 7 + 6] = clipped_bbox.y_max; + ++count; + } + } + } + // out.copyFrom(out_data, num_kept * 7); +} +} // namespace math +} // namespace operators +} // namespace paddle -- GitLab