Unverified commit 9d2e0ecb, authored by Thomas Young, committed by GitHub

[operator migration] Migrate unstack_op and nms_op (#44424)

* update unstack_op

* update unstack_op

* update unstack_op

* fix unstack test

* update unstack

* update with remote

* fix unstack_test.py

* temp_save_change_nms_op

* add nms test

* update nms fix

* update unstack_op

* temp save change

* finish fix nms_op

* pass nms test

* fix CI

* fix ops test

* save change

* fix code style

* fix code style

* fix ci and codestyle

* fix ci
Co-authored-by: ShiningZhang <zhang_liang1991@126.com>
Parent 74e46a93
......@@ -81,7 +81,7 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
sigmoid_focal_loss_op.cu)
detection_library(retinanet_detection_output_op SRCS
retinanet_detection_output_op.cc)
detection_library(nms_op SRCS nms_op.cc nms_op.cu)
detection_library(nms_op SRCS nms_op.cc)
if(WITH_GPU OR WITH_ROCM)
set(TMPDEPS memory)
......
......@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/nms_op.h"
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
......@@ -65,23 +69,6 @@ class NMSOpMaker : public framework::OpProtoAndCheckerMaker {
class NMSOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Boxes"), "Input", "Boxes", "NMS");
OP_INOUT_CHECK(
ctx->HasOutput("KeepBoxesIdxs"), "Output", "KeepBoxesIdxs", "NMS");
auto boxes_dim = ctx->GetInputDim("Boxes");
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
platform::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
ctx->SetOutputDim("KeepBoxesIdxs", {num_boxes});
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -92,64 +79,20 @@ class NMSOp : public framework::OperatorWithKernel {
};
template <typename T>
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T>
class NMSKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
NMS<T>(boxes->data<T>(), output_data, threshold, boxes->dims()[0]);
}
};
class NMSKernel : public framework::OpKernel<T> {};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(nms,
NMSInferMetaFunctor,
PD_INFER_META(phi::NMSInferMeta));
REGISTER_OPERATOR(
nms,
ops::NMSOp,
ops::NMSOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(nms, ops::NMSKernel<float>, ops::NMSKernel<double>);
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
NMSInferMetaFunctor);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/detection/nms_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T>
class NMSCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
auto* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
const int64_t num_boxes = boxes->dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data =
memory::Alloc(context.cuda_device_context(),
num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, context.cuda_device_context().stream()>>>(
boxes->data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
memory::Copy(platform::CPUPlace(),
mask_host.data(),
context.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
context.cuda_device_context().stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
memory::Copy(context.GetPlace(),
output_data,
platform::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
context.cuda_device_context().stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(nms,
ops::NMSCudaKernel<float>,
ops::NMSCudaKernel<double>);
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
......@@ -63,51 +64,6 @@ class UnStackGradOpMaker : public framework::SingleGradOpMaker<T> {
class UnStackGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(),
0,
platform::errors::InvalidArgument(
"The Inputs(Y@Grad) of unstack operator are empty."));
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
"Output",
"X",
"UnStackGrad");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
platform::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), phi::make_ddim(vec));
}
};
} // namespace operators
......@@ -119,12 +75,15 @@ namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(unstack,
UnStackInferMetaFunctor,
PD_INFER_META(phi::UnStackInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(unstack_grad,
UnStackGradInferMetaFunctor,
PD_INFER_META(phi::UnStackGradInferMeta));
REGISTER_OPERATOR(unstack,
ops::UnStackOp,
ops::UnStackOpMaker,
ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
ops::UnStackGradOpMaker<paddle::imperative::OpBase>,
UnStackInferMetaFunctor);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
REGISTER_OPERATOR(unstack_grad,
ops::UnStackGradOp,
UnStackGradInferMetaFunctor);
......@@ -889,7 +889,7 @@
func : FrameInferMeta
kernel :
func : frame
backward : frame_grad
backward : frame_grad
- api : frobenius_norm
args : (Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all)
......@@ -1700,6 +1700,15 @@
optional : weight
backward : nll_loss_grad
- api : nms
args : (Tensor x, float threshold)
output : Tensor(out)
infer_meta :
func : NMSInferMeta
kernel :
func : nms
data_type : x
- api : norm
args : (Tensor x, int axis, float epsilon, bool is_test)
output : Tensor(out), Tensor(norm)
......@@ -2258,7 +2267,7 @@
kernel :
func : spectralnorm
data_type : weight
backward : spectral_norm_grad
backward : spectral_norm_grad
- api : split
args : (Tensor x, IntArray num_or_sections, Scalar(int) axis)
......@@ -2566,6 +2575,16 @@
intermediate : xshape
backward : unsqueeze_grad
# unstack
- api : unstack
args : (Tensor x, int axis, int num)
output : Tensor[]{num}
infer_meta :
func : UnStackInferMeta
kernel :
func : unstack
backward : unstack_grad
# viterbi_decode
- api : viterbi_decode
args : (Tensor input, Tensor transition, Tensor length, bool include_bos_eos_tag)
......@@ -2629,7 +2648,7 @@
kernel:
func: broadcast_tensors
backward: broadcast_tensors_grad
# dirichlet
- api: dirichlet
args: (Tensor alpha)
......
......@@ -2499,6 +2499,16 @@
inplace : (out_grad -> x_grad)
backward : unsqueeze_double_grad
- backward_api : unstack_grad
forward : unstack (Tensor x, int axis, int num) -> Tensor[](out)
args : (Tensor[] out_grad, int axis)
output : Tensor(x_grad)
infer_meta :
func : UnStackGradInferMeta
param : [out_grad, axis]
kernel :
func : unstack_grad
- backward_api : warpctc_grad
forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad)
args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times)
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
......@@ -787,4 +786,46 @@ void StackGradInferMeta(const MetaTensor& out_grad,
}
}
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad) {
std::vector<phi::DDim> input_dims(out_grad.size());
for (size_t i = 0; i < out_grad.size(); ++i) {
input_dims[i] = out_grad[i]->dims();
}
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
phi::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
x_grad->set_dims(phi::make_ddim(vec));
x_grad->set_dtype(out_grad[0]->dtype());
}
} // namespace phi
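For reference, the shape rule implemented by UnStackGradInferMeta above reads as: take the common shape of the Y@Grad tensors and re-insert a dimension equal to their count at axis, with negative axes counted against rank + 1. A minimal Python sketch of that rule follows; the helper name is hypothetical and not part of this commit.
# Illustrative sketch only; unstack_grad_shape is not part of the codebase.
def unstack_grad_shape(out_grad_shapes, axis):
    # All Y@Grad shapes must match.
    assert all(s == out_grad_shapes[0] for s in out_grad_shapes)
    rank = len(out_grad_shapes[0])
    assert -(rank + 1) <= axis < rank + 1
    if axis < 0:
        axis += rank + 1
    shape = list(out_grad_shapes[0])
    shape.insert(axis, len(out_grad_shapes))  # re-insert the unstacked dimension
    return shape

# Example: three [2, 5] gradients unstacked along axis=1 give an X@Grad of [2, 3, 5].
assert unstack_grad_shape([(2, 5)] * 3, axis=1) == [2, 3, 5]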
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <tuple>
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/multiary.h"
......@@ -325,4 +324,8 @@ void StackGradInferMeta(const MetaTensor& out_grad,
int axis,
std::vector<MetaTensor*> x_grad);
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad);
} // namespace phi
......@@ -1707,6 +1707,19 @@ void NanmedianInferMeta(const MetaTensor& x,
out->set_dims(make_ddim(out_dim));
}
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) {
auto boxes_dim = x.dims();
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
phi::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
out->set_dims(phi::make_ddim({num_boxes}));
}
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
......@@ -228,6 +228,8 @@ void NanmedianInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaTensor* median_index);
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out);
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/diagonal.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T>
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto output_data = dev_ctx.template Alloc<int64_t>(output);
NMS<T>(boxes.data<T>(), output_data, threshold, boxes.dims()[0]);
}
} // namespace phi
PD_REGISTER_KERNEL(nms, CPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
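The CPU kernel above is a direct greedy suppression: boxes are scanned in input order, each kept box masks out every later box whose IoU with it exceeds iou_threshold, and the tail of KeepBoxesIdxs is padded with zeros. A rough NumPy sketch of the same procedure is given below; it assumes corner-format boxes consistent with the unit test later in this diff, and the helper names are illustrative only.
import numpy as np

# Illustrative sketch of the greedy NMS above; not part of this commit.
def iou(box_a, box_b):
    # Boxes assumed in (x1, y1, x2, y2) corner format.
    xa, ya = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    xb, yb = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, xb - xa) * max(0.0, yb - ya)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

def greedy_nms(boxes, threshold):
    num_boxes = len(boxes)
    suppressed = np.zeros(num_boxes, dtype=bool)
    keep = []
    for i in range(num_boxes):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in range(i + 1, num_boxes):
            if not suppressed[j] and iou(boxes[i], boxes[j]) > threshold:
                suppressed[j] = True
    # Like the kernel, pad the remaining output slots with zeros.
    return np.array(keep + [0] * (num_boxes - len(keep)), dtype=np.int64)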
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace phi {
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
const int64_t num_boxes = boxes.dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data = paddle::memory::Alloc(
dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
dev_ctx.stream());
}
} // namespace phi
PD_REGISTER_KERNEL(nms, GPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
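For clarity, the GPU path above splits the work in two: the kernel fills a matrix of 64-bit overlap masks (num_boxes rows, blocks_per_line columns) in parallel, and the host loop then performs the same greedy selection as the CPU kernel by OR-accumulating those masks. A small Python sketch of that host-side reduction follows; names are illustrative, and masks[i][b] is assumed to hold box i's overlap bits against column block b.
# Illustrative sketch of the host-side mask reduction above; not part of this commit.
def reduce_masks(masks, num_boxes, threads_per_block=64):
    blocks_per_line = (num_boxes + threads_per_block - 1) // threads_per_block
    remv = [0] * blocks_per_line  # accumulated "already suppressed" bits
    keep = []
    for i in range(num_boxes):
        block, bit = divmod(i, threads_per_block)
        if not (remv[block] >> bit) & 1:
            keep.append(i)              # box i survives
            for b in range(block, blocks_per_line):
                remv[b] |= masks[i][b]  # suppress everything box i overlaps
    return keep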
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace phi {
HOSTDEVICE static inline int64_t CeilDivide(int64_t n, int64_t m) {
return (n + m - 1) / m;
......@@ -48,5 +47,10 @@ HOSTDEVICE inline bool CalculateIoU(const T* const box_1,
return inter_area / union_area > threshold;
}
} // namespace operators
} // namespace paddle
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output);
} // namespace phi
......@@ -10685,6 +10685,7 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
......@@ -15,6 +15,7 @@
import unittest
import numpy as np
from op_test import OpTest
import paddle
def iou(box_a, box_b):
......@@ -71,22 +72,25 @@ class TestNMSOp(OpTest):
def setUp(self):
self.op_type = 'nms'
self.python_api = paddle.vision.ops.nms
self.dtype = np.float64
self.init_dtype_type()
boxes = np.random.rand(32, 4).astype(self.dtype)
boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
paddle.disable_static()
self.inputs = {'Boxes': boxes}
self.attrs = {'iou_threshold': 0.5}
out_py = nms(boxes, self.attrs['iou_threshold'])
self.outputs = {'KeepBoxesIdxs': out_py}
paddle.enable_static()
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
if __name__ == "__main__":
......
......@@ -15,6 +15,7 @@
from op_test import OpTest
import numpy as np
import unittest
import paddle
class TestUnStackOpBase(OpTest):
......@@ -37,6 +38,7 @@ class TestUnStackOpBase(OpTest):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.python_api = paddle.unstack
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
......@@ -44,18 +46,21 @@ class TestUnStackOpBase(OpTest):
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
tmp_names = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
tmp_names.append(y_names[i])
self.python_out_sig = tmp_names
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
def test_check_grad(self):
self.check_grad(['X'], self.get_y_names())
self.check_grad(['X'], self.get_y_names(), check_eager=True)
class TestStackOp3(TestUnStackOpBase):
......
......@@ -454,6 +454,13 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if in_dygraph_mode():
if num == None:
num = x.shape[axis]
if num == 0:
return []
return _C_ops.final_state_unstack(x, axis, num)
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
......@@ -1579,6 +1579,9 @@ def nms(boxes,
"""
def _nms(boxes, iou_threshold):
if in_dygraph_mode():
return _C_ops.final_state_nms(boxes, iou_threshold)
if _non_static_mode():
return _C_ops.nms(boxes, 'iou_threshold', iou_threshold)
......
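As a quick end-to-end check of the migrated Python entry points above, a minimal sketch is shown below; it assumes a build that includes this commit and runs in dygraph mode, and the printed values depend on the random inputs.
import numpy as np
import paddle

# Sketch only; assumes a PaddlePaddle build that contains this commit.
paddle.disable_static()

# unstack: split a [2, 3, 5] tensor along axis=1 into three [2, 5] tensors.
x = paddle.rand([2, 3, 5])
ys = paddle.unstack(x, axis=1)
print(len(ys), ys[0].shape)  # 3 [2, 5]

# nms: keep boxes in (x1, y1, x2, y2) format whose mutual IoU is at most 0.5.
boxes = np.random.rand(32, 4).astype('float32')
boxes[:, 2:] += boxes[:, :2]
kept = paddle.vision.ops.nms(paddle.to_tensor(boxes), iou_threshold=0.5)
print(kept)  # indices of the kept boxes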