Unverified commit 9d2e0ecb, authored by Thomas Young, committed by GitHub

[operator migration] Migrate unstack_op and nms_op (#44424)

* update unstack_op

* update unstack_op

* update unstack_op

* fix unstack test

* update unstack

* update with remote

* fix unstack_test.py

* temp_save_change_nms_op

* add nms test

* update nms fix

* update unstack_op

* temp save change

* finish fix nms_op

* pass nms test

* fix CI

* fix ops test

* save change

* fix code style

* fix code style

* fix ci and codestyle

* fix ci
Co-authored-by: ShiningZhang <zhang_liang1991@126.com>
Parent commit: 74e46a93
@@ -81,7 +81,7 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
sigmoid_focal_loss_op.cu)
detection_library(retinanet_detection_output_op SRCS
retinanet_detection_output_op.cc)
detection_library(nms_op SRCS nms_op.cc nms_op.cu)
detection_library(nms_op SRCS nms_op.cc)
if(WITH_GPU OR WITH_ROCM)
set(TMPDEPS memory)
......
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/nms_op.h"
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -65,23 +69,6 @@ class NMSOpMaker : public framework::OpProtoAndCheckerMaker {
class NMSOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Boxes"), "Input", "Boxes", "NMS");
OP_INOUT_CHECK(
ctx->HasOutput("KeepBoxesIdxs"), "Output", "KeepBoxesIdxs", "NMS");
auto boxes_dim = ctx->GetInputDim("Boxes");
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
platform::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
ctx->SetOutputDim("KeepBoxesIdxs", {num_boxes});
}
protected:
framework::OpKernelType GetExpectedKernelType(
@@ -92,64 +79,20 @@ class NMSOp : public framework::OperatorWithKernel {
};
template <typename T>
class NMSKernel : public framework::OpKernel<T> {};
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T>
class NMSKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
NMS<T>(boxes->data<T>(), output_data, threshold, boxes->dims()[0]);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(nms,
NMSInferMetaFunctor,
PD_INFER_META(phi::NMSInferMeta));
REGISTER_OPERATOR(
nms,
ops::NMSOp,
ops::NMSOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
NMSInferMetaFunctor);
REGISTER_OP_CPU_KERNEL(nms, ops::NMSKernel<float>, ops::NMSKernel<double>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/detection/nms_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T>
class NMSCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
auto* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
const int64_t num_boxes = boxes->dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data =
memory::Alloc(context.cuda_device_context(),
num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, context.cuda_device_context().stream()>>>(
boxes->data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
memory::Copy(platform::CPUPlace(),
mask_host.data(),
context.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
context.cuda_device_context().stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
memory::Copy(context.GetPlace(),
output_data,
platform::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
context.cuda_device_context().stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(nms,
ops::NMSCudaKernel<float>,
ops::NMSCudaKernel<double>);
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
@@ -63,51 +64,6 @@ class UnStackGradOpMaker : public framework::SingleGradOpMaker<T> {
class UnStackGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(),
0,
platform::errors::InvalidArgument(
"The Inputs(Y@Grad) of unstack operator are empty."));
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
"Output",
"X",
"UnStackGrad");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
platform::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), phi::make_ddim(vec));
}
};
} // namespace operators
@@ -119,12 +75,15 @@ namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(unstack,
UnStackInferMetaFunctor,
PD_INFER_META(phi::UnStackInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(unstack_grad,
UnStackGradInferMetaFunctor,
PD_INFER_META(phi::UnStackGradInferMeta));
REGISTER_OPERATOR(unstack,
ops::UnStackOp,
ops::UnStackOpMaker,
ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
ops::UnStackGradOpMaker<paddle::imperative::OpBase>,
UnStackInferMetaFunctor);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
REGISTER_OPERATOR(unstack_grad,
ops::UnStackGradOp,
UnStackGradInferMetaFunctor);
@@ -889,7 +889,7 @@
func : FrameInferMeta
kernel :
func : frame
backward : frame_grad
- api : frobenius_norm
args : (Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all)
@@ -1700,6 +1700,15 @@
optional : weight
backward : nll_loss_grad
- api : nms
args : (Tensor x, float threshold)
output : Tensor(out)
infer_meta :
func : NMSInferMeta
kernel :
func : nms
data_type : x
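The entry above wires the new final-state nms API to NMSInferMeta and the phi nms kernel. As a quick orientation (not part of this diff), a minimal eager-mode call through the public Python wrapper might look like the sketch below; it assumes the paddle.vision.ops.nms signature exercised by the updated unit test further down, with boxes of shape [N, 4].

import numpy as np
import paddle

# Hypothetical usage sketch: run NMS on a handful of random boxes.
boxes = np.random.rand(8, 4).astype('float32')
boxes[:, 2] = boxes[:, 0] + boxes[:, 2]  # turn widths into x2
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]  # turn heights into y2
kept = paddle.vision.ops.nms(paddle.to_tensor(boxes), iou_threshold=0.5)
print(kept)  # int64 indices of the boxes that survive suppression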
- api : norm
args : (Tensor x, int axis, float epsilon, bool is_test)
output : Tensor(out), Tensor(norm)
@@ -2258,7 +2267,7 @@
kernel :
func : spectralnorm
data_type : weight
backward : spectral_norm_grad
- api : split
args : (Tensor x, IntArray num_or_sections, Scalar(int) axis)
@@ -2566,6 +2575,16 @@
intermediate : xshape
backward : unsqueeze_grad
# unstack
- api : unstack
args : (Tensor x, int axis, int num)
output : Tensor[]{num}
infer_meta :
func : UnStackInferMeta
kernel :
func : unstack
backward : unstack_grad
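Likewise, the unstack entry above maps the Python API onto UnStackInferMeta and the phi unstack kernel. A minimal usage sketch (mirroring the docstring already quoted later in this diff) is:

import paddle

# Hypothetical sketch: split a [2, 3, 5] tensor along axis 1.
x = paddle.ones([2, 3, 5])
ys = paddle.unstack(x, axis=1)   # list of 3 tensors, each with shape [2, 5]
print(len(ys), ys[0].shape)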
# viterbi_decode
- api : viterbi_decode
args : (Tensor input, Tensor transition, Tensor length, bool include_bos_eos_tag)
@@ -2629,7 +2648,7 @@
kernel:
func: broadcast_tensors
backward: broadcast_tensors_grad
# dirichlet
- api: dirichlet
args: (Tensor alpha)
......
@@ -2499,6 +2499,16 @@
inplace : (out_grad -> x_grad)
backward : unsqueeze_double_grad
- backward_api : unstack_grad
forward : unstack (Tensor x, int axis, int num) -> Tensor[](out)
args : (Tensor[] out_grad, int axis)
output : Tensor(x_grad)
infer_meta :
func : UnStackGradInferMeta
param : [out_grad, axis]
kernel :
func : unstack_grad
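The backward entry above feeds all output gradients plus the axis into UnStackGradInferMeta (added later in this commit). The shape rule it encodes is simply that the gradient of unstack is the stack of the output gradients along the same axis; a NumPy sketch of that assumption:

import numpy as np

# Illustrative only: three [2, 5] output gradients stacked back along axis 1
# give the [2, 3, 5] input gradient, matching the infer-meta shape insertion.
out_grads = [np.ones((2, 5)) for _ in range(3)]
x_grad = np.stack(out_grads, axis=1)
print(x_grad.shape)  # (2, 3, 5)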
- backward_api : warpctc_grad
forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad)
args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times)
......
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
@@ -787,4 +786,46 @@ void StackGradInferMeta(const MetaTensor& out_grad,
}
}
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad) {
std::vector<phi::DDim> input_dims(out_grad.size());
for (size_t i = 0; i < out_grad.size(); ++i) {
input_dims[i] = out_grad[i]->dims();
}
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
phi::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
x_grad->set_dims(phi::make_ddim(vec));
x_grad->set_dtype(out_grad[0]->dtype());
}
} // namespace phi
@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <tuple>
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/multiary.h"
@@ -325,4 +324,8 @@ void StackGradInferMeta(const MetaTensor& out_grad,
int axis,
std::vector<MetaTensor*> x_grad);
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad);
} // namespace phi
@@ -1707,6 +1707,19 @@ void NanmedianInferMeta(const MetaTensor& x,
out->set_dims(make_ddim(out_dim));
}
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) {
auto boxes_dim = x.dims();
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
phi::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
out->set_dims(phi::make_ddim({num_boxes}));
}
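NMSInferMeta above fixes the output shape independently of how many boxes actually survive: for Boxes of shape [N, 4], KeepBoxesIdxs is always a length-N index tensor (the CPU kernel below pads the unused tail with zeros). A tiny illustrative check of that contract, with no Paddle call involved:

import numpy as np

# Illustration of the static shape rule only.
boxes = np.random.rand(32, 4)
assert boxes.ndim == 2 and boxes.shape[1] == 4
keep_idxs_shape = (boxes.shape[0],)   # what out->set_dims({num_boxes}) records
print(keep_idxs_shape)                # (32,)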
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
@@ -228,6 +228,8 @@ void NanmedianInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaTensor* median_index);
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out);
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/diagonal.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T>
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto output_data = dev_ctx.template Alloc<int64_t>(output);
NMS<T>(boxes.data<T>(), output_data, threshold, boxes.dims()[0]);
}
} // namespace phi
PD_REGISTER_KERNEL(nms, CPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
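For readers new to this kernel, the CPU path above is the classic greedy hard-NMS: walk boxes in order, keep a box if it has not been suppressed, and mark every later box whose IoU with it exceeds the threshold, using packed 64-bit masks as the suppressed set. Below is a NumPy sketch of the same logic, with boolean flags instead of bit masks and returning only the kept prefix rather than zero-padding to N; a [x1, y1, x2, y2] box layout is assumed.

import numpy as np

def nms_reference(boxes, threshold):
    # boxes: (N, 4) float array; returns indices of kept boxes in order.
    n = len(boxes)
    suppressed = np.zeros(n, dtype=bool)   # stands in for the 64-bit masks
    keep = []
    for i in range(n):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in range(i + 1, n):
            if suppressed[j]:
                continue
            # IoU(i, j) > threshold, the test CalculateIoU performs
            x1 = max(boxes[i, 0], boxes[j, 0])
            y1 = max(boxes[i, 1], boxes[j, 1])
            x2 = min(boxes[i, 2], boxes[j, 2])
            y2 = min(boxes[i, 3], boxes[j, 3])
            inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
            area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
            area_j = (boxes[j, 2] - boxes[j, 0]) * (boxes[j, 3] - boxes[j, 1])
            if inter / (area_i + area_j - inter) > threshold:
                suppressed[j] = True
    return keep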
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace phi {
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
const int64_t num_boxes = boxes.dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data = paddle::memory::Alloc(
dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
dev_ctx.stream());
}
} // namespace phi
PD_REGISTER_KERNEL(nms, GPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
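The GPU path splits the same computation in two: the CUDA kernel fills, for every box, one 64-bit mask per 64-box column block recording which boxes it overlaps, and the host loop then scans boxes in order, keeping a box only if no earlier kept box suppressed it and folding its masks into a running removed set. A NumPy sketch of that host-side reduction, with a dense boolean matrix standing in for the packed masks (the matrix is what the kernel computes in parallel):

import numpy as np

def reduce_overlap_matrix(overlap):
    # overlap[i, j] is True when IoU(box i, box j) exceeds the threshold;
    # only entries with j > i affect the result, mirroring the block masks.
    n = overlap.shape[0]
    removed = np.zeros(n, dtype=bool)
    keep = []
    for i in range(n):
        if removed[i]:
            continue
        keep.append(i)                 # output_host[last_box_num++] = i
        removed |= overlap[i]          # remv[j] |= current_mask[j]
    return keep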
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace phi {
HOSTDEVICE static inline int64_t CeilDivide(int64_t n, int64_t m) {
return (n + m - 1) / m;
@@ -48,5 +47,10 @@ HOSTDEVICE inline bool CalculateIoU(const T* const box_1,
return inter_area / union_area > threshold;
}
} // namespace operators
} // namespace paddle
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output);
} // namespace phi
@@ -10685,6 +10685,7 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
@@ -15,6 +15,7 @@
import unittest
import numpy as np
from op_test import OpTest
import paddle
def iou(box_a, box_b):
@@ -71,22 +72,25 @@ class TestNMSOp(OpTest):
def setUp(self):
self.op_type = 'nms'
self.python_api = paddle.vision.ops.nms
self.dtype = np.float64
self.init_dtype_type()
boxes = np.random.rand(32, 4).astype(self.dtype)
boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
paddle.disable_static()
self.inputs = {'Boxes': boxes}
self.attrs = {'iou_threshold': 0.5}
out_py = nms(boxes, self.attrs['iou_threshold'])
self.outputs = {'KeepBoxesIdxs': out_py}
paddle.enable_static()
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
if __name__ == "__main__":
......
@@ -15,6 +15,7 @@
from op_test import OpTest
import numpy as np
import unittest
import paddle
class TestUnStackOpBase(OpTest):
@@ -37,6 +38,7 @@ class TestUnStackOpBase(OpTest):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.python_api = paddle.unstack
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
@@ -44,18 +46,21 @@ class TestUnStackOpBase(OpTest):
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
tmp_names = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
tmp_names.append(y_names[i])
self.python_out_sig = tmp_names
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
def test_check_grad(self):
self.check_grad(['X'], self.get_y_names())
self.check_grad(['X'], self.get_y_names(), check_eager=True)
class TestStackOp3(TestUnStackOpBase):
......
@@ -454,6 +454,13 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if in_dygraph_mode():
if num == None:
num = x.shape[axis]
if num == 0:
return []
return _C_ops.final_state_unstack(x, axis, num)
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
@@ -1579,6 +1579,9 @@ def nms(boxes,
"""
def _nms(boxes, iou_threshold):
if in_dygraph_mode():
return _C_ops.final_state_nms(boxes, iou_threshold)
if _non_static_mode():
return _C_ops.nms(boxes, 'iou_threshold', iou_threshold)
......