未验证 提交 ba992136 编写于 作者: L leolishaohao 提交者: GitHub

[XPU] register multiclass_nms3 and norm xpu kernel to optimize model (#56064)

上级 c271f26b
...@@ -542,6 +542,7 @@ XPUOpMap& get_kl2_ops() { ...@@ -542,6 +542,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::INT64})}, phi::DataType::INT64})},
{"multi_encoder_xpu", {"multi_encoder_xpu",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"multiclass_nms3", XPUKernelSet({phi::DataType::FLOAT32})},
{"nearest_interp_v2", {"nearest_interp_v2",
XPUKernelSet({phi::DataType::FLOAT32, XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16, phi::DataType::FLOAT16,
...@@ -549,6 +550,7 @@ XPUOpMap& get_kl2_ops() { ...@@ -549,6 +550,7 @@ XPUOpMap& get_kl2_ops() {
{"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"nll_loss", XPUKernelSet({phi::DataType::FLOAT32})}, {"nll_loss", XPUKernelSet({phi::DataType::FLOAT32})},
{"nll_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"nll_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"norm", XPUKernelSet({phi::DataType::FLOAT32})},
{"not_equal", {"not_equal",
XPUKernelSet({phi::DataType::INT64, XPUKernelSet({phi::DataType::INT64,
phi::DataType::INT32, phi::DataType::INT32,
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/multiclass_nms3_kernel.h"
#include <vector>
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void MultiClassNMSKernel(const Context& ctx,
const DenseTensor& bboxes,
const DenseTensor& scores,
const paddle::optional<DenseTensor>& rois_num,
float score_threshold,
int nums_top_k,
int keep_top_k,
float nms_threshold,
bool normalized,
float nms_eta,
int background_label,
DenseTensor* out,
DenseTensor* index,
DenseTensor* nms_rois_num) {
using XPUT = typename XPUTypeTrait<T>::Type;
const XPUT* bboxes_data = reinterpret_cast<const XPUT*>(bboxes.data<T>());
const XPUT* scores_data = reinterpret_cast<const XPUT*>(scores.data<T>());
bool return_index = index != nullptr;
bool has_rois_num = rois_num.get_ptr() != nullptr;
bool return_rois_num = nms_rois_num != nullptr;
auto score_dims = phi::vectorize<int>(scores.dims());
auto score_size = score_dims.size();
bool is_lod = score_size == 2 ? true : false;
int n = 0;
int b = 0;
int class_num = scores.dims()[1];
int out_dim = bboxes.dims()[2] + 2;
int boxes_count = 0;
std::vector<int> rois_num_vec;
rois_num_vec.clear();
if (is_lod) {
if (has_rois_num) {
n = rois_num.get_ptr()->numel();
for (int i = 0; i < n; i++) {
rois_num_vec.push_back(rois_num.get_ptr()->data<int>()[i]);
boxes_count += rois_num.get_ptr()->data<int>()[i];
}
} else {
auto lod = bboxes.lod().back();
boxes_count = lod[lod.size() - 1];
n = lod.size() - 1;
for (int i = 0; i < n; i++) {
rois_num_vec.push_back(lod[i + 1] - lod[i]);
}
}
PADDLE_ENFORCE_EQ(boxes_count == bboxes.dims()[0],
true,
phi::errors::InvalidArgument(
"boxes_count should equal boxes->dims()[0].",
"But received: (%d) and (%d)",
boxes_count,
bboxes.dims()[0]));
PADDLE_ENFORCE_EQ(
boxes_count == score_dims[0],
true,
phi::errors::InvalidArgument("boxes_count shuold equal score_dims[0].",
"But received: (%d) and (%d)",
boxes_count,
score_dims[0]));
} else {
n = bboxes.dims()[0];
b = bboxes.dims()[1];
boxes_count = n * b;
}
std::vector<T> outs_vec_;
std::vector<int> out_index_vec_;
outs_vec_.resize(boxes_count * out_dim);
out_index_vec_.resize(boxes_count);
std::vector<size_t> batch_starts;
int r = 0;
r = xpu::multiclass_nms<T, int>(ctx.x_context(),
bboxes_data,
scores_data,
rois_num_vec,
outs_vec_,
out_index_vec_,
batch_starts,
n,
b,
class_num,
out_dim,
nums_top_k,
score_threshold,
keep_top_k,
nms_threshold,
background_label,
normalized,
nms_eta,
return_index,
is_lod);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "multiclass_nms");
uint64_t num_kept = batch_starts.back();
if (num_kept == 0) {
if (return_index) {
// out_dim may be zero when there is no object in picture, so add some
// zeros to it
// caution: results may differ between cpu and xpu due to this operation
out->Resize({1, out_dim});
ctx.template Alloc<T>(out);
T* out_ptr = out->template data<T>();
std::vector<T> temp_value(out_dim, 0.0f);
memory_utils::Copy(ctx.GetPlace(),
out_ptr,
phi::CPUPlace(),
temp_value.data(),
1 * out_dim * sizeof(T));
index->Resize({1, 1});
ctx.template Alloc<int>(index);
int* out_index_ptr = index->template data<int>();
std::vector<int> temp_idx(1, 0);
memory_utils::Copy(ctx.GetPlace(),
out_index_ptr,
phi::CPUPlace(),
temp_idx.data(),
1 * sizeof(int));
} else {
out->Resize({1, 1});
T* od = ctx.template Alloc<T>(out);
od[0] = -1;
batch_starts = {0, 1};
}
} else {
out->Resize({static_cast<int64_t>(num_kept), out_dim});
ctx.template Alloc<T>(out);
T* out_ptr = out->template data<T>();
memory_utils::Copy(ctx.GetPlace(),
out_ptr,
phi::CPUPlace(),
outs_vec_.data(),
num_kept * out_dim * sizeof(T));
if (return_index) {
index->Resize({static_cast<int64_t>(num_kept), 1});
ctx.template Alloc<int>(index);
int* out_index_ptr = index->template data<int>();
memory_utils::Copy(ctx.GetPlace(),
out_index_ptr,
phi::CPUPlace(),
out_index_vec_.data(),
num_kept * sizeof(int));
}
}
if (return_rois_num) {
nms_rois_num->Resize({n});
ctx.template Alloc<int>(nms_rois_num);
DenseTensor nms_rois_num_cpu;
nms_rois_num_cpu.Resize({nms_rois_num->numel()});
ctx.template HostAlloc<int>(&nms_rois_num_cpu);
int* nms_rois_num_cpu_data = nms_rois_num_cpu.data<int>();
for (int i = 1; i <= n; i++) {
nms_rois_num_cpu_data[i - 1] = batch_starts[i] - batch_starts[i - 1];
}
phi::Copy(ctx, nms_rois_num_cpu, nms_rois_num->place(), true, nms_rois_num);
}
LoD lod;
if (num_kept == 0) {
batch_starts[batch_starts.size() - 1] = 1;
}
lod.emplace_back(batch_starts);
if (return_index) {
index->set_lod(lod);
}
out->set_lod(lod);
}
} // namespace phi
PD_REGISTER_KERNEL(
multiclass_nms3, XPU, ALL_LAYOUT, phi::MultiClassNMSKernel, float) {
kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
}
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/norm_kernel.h"
#include <vector>
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void NormKernel(const Context& ctx,
const DenseTensor& x,
int axis,
float epsilon,
bool is_test,
DenseTensor* out,
DenseTensor* norm) {
ctx.template Alloc<T>(out);
ctx.template Alloc<T>(norm);
std::vector<int> xshape;
auto x_dims = x.dims();
auto x_dims_size = x_dims.size();
xshape.resize(x_dims_size);
if (axis < 0) {
axis += x_dims_size;
}
PADDLE_ENFORCE_GE(
axis,
0,
phi::errors::InvalidArgument("axis must be greater than or equal to 0."
"But received axis: %d.",
axis));
PADDLE_ENFORCE_LT(axis,
x_dims_size,
phi::errors::InvalidArgument(
"Attr(axis) value must be less than rank of Input(X)"
"But received axis: %d, rank: %d.",
axis,
x_dims_size));
for (int i = 0; i < x_dims_size; i++) {
xshape[i] = static_cast<int>(x_dims[i]);
}
int r = xpu::l2_norm<T>(ctx.x_context(),
x.data<T>(),
out->data<T>(),
norm->data<T>(),
xshape,
axis,
epsilon);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "l2_norm");
}
} // namespace phi
PD_REGISTER_KERNEL(norm, XPU, ALL_LAYOUT, phi::NormKernel, float) {}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import unittest
import numpy as np
from get_test_cover_info import (
XPUOpTestWrapper,
create_test_class,
get_xpu_op_support_types,
)
from op_test_xpu import XPUOpTest
import paddle
paddle.enable_static()
def softmax(x):
# clip to shiftx, otherwise, when calc loss with
# log(exp(shiftx)), may get log(0)=INF
shiftx = (x - np.max(x)).clip(-64.0)
exps = np.exp(shiftx)
return exps / np.sum(exps)
def iou(box_a, box_b, norm):
"""Apply intersection-over-union overlap between box_a and box_b."""
xmin_a = min(box_a[0], box_a[2])
ymin_a = min(box_a[1], box_a[3])
xmax_a = max(box_a[0], box_a[2])
ymax_a = max(box_a[1], box_a[3])
xmin_b = min(box_b[0], box_b[2])
ymin_b = min(box_b[1], box_b[3])
xmax_b = max(box_b[0], box_b[2])
ymax_b = max(box_b[1], box_b[3])
area_a = (ymax_a - ymin_a + (not norm)) * (xmax_a - xmin_a + (not norm))
area_b = (ymax_b - ymin_b + (not norm)) * (xmax_b - xmin_b + (not norm))
if area_a <= 0 and area_b <= 0:
return 0.0
xa = max(xmin_a, xmin_b)
ya = max(ymin_a, ymin_b)
xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa + (not norm), 0.0) * max(yb - ya + (not norm), 0.0)
iou_ratio = inter_area / (area_a + area_b - inter_area)
return iou_ratio
def nms(
boxes,
scores,
score_threshold,
nms_threshold,
top_k=200,
normalized=True,
eta=1.0,
):
"""Apply non-maximum suppression at test time to avoid detecting too many
overlapping bounding boxes for a given object.
Args:
boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
scores: (tensor) The class predscores for the img, Shape:[num_priors].
score_threshold: (float) The confidence thresh for filtering low
confidence boxes.
nms_threshold: (float) The overlap thresh for suppressing unnecessary
boxes.
top_k: (int) The maximum number of box preds to consider.
eta: (float) The parameter for adaptive NMS.
Return:
The indices of the kept boxes with respect to num_priors.
"""
all_scores = copy.deepcopy(scores)
all_scores = all_scores.flatten()
selected_indices = np.argwhere(all_scores > score_threshold)
selected_indices = selected_indices.flatten()
all_scores = all_scores[selected_indices]
sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
sorted_scores = all_scores[sorted_indices]
sorted_indices = selected_indices[sorted_indices]
if top_k > -1 and top_k < sorted_indices.shape[0]:
sorted_indices = sorted_indices[:top_k]
sorted_scores = sorted_scores[:top_k]
selected_indices = []
adaptive_threshold = nms_threshold
for i in range(sorted_scores.shape[0]):
idx = sorted_indices[i]
keep = True
for k in range(len(selected_indices)):
if keep:
kept_idx = selected_indices[k]
overlap = iou(boxes[idx], boxes[kept_idx], normalized)
keep = True if overlap <= adaptive_threshold else False
else:
break
if keep:
selected_indices.append(idx)
if keep and eta < 1 and adaptive_threshold > 0.5:
adaptive_threshold *= eta
return selected_indices
def multiclass_nms(
boxes,
scores,
background,
score_threshold,
nms_threshold,
nms_top_k,
keep_top_k,
normalized,
shared,
):
if shared:
class_num = scores.shape[0]
priorbox_num = scores.shape[1]
else:
box_num = scores.shape[0]
class_num = scores.shape[1]
selected_indices = {}
num_det = 0
for c in range(class_num):
if c == background:
continue
if shared:
indices = nms(
boxes,
scores[c],
score_threshold,
nms_threshold,
nms_top_k,
normalized,
)
else:
indices = nms(
boxes[:, c, :],
scores[:, c],
score_threshold,
nms_threshold,
nms_top_k,
normalized,
)
selected_indices[c] = indices
num_det += len(indices)
if keep_top_k > -1 and num_det > keep_top_k:
score_index = []
for c, indices in selected_indices.items():
for idx in indices:
if shared:
score_index.append((scores[c][idx], c, idx))
else:
score_index.append((scores[idx][c], c, idx))
sorted_score_index = sorted(
score_index, key=lambda tup: tup[0], reverse=True
)
sorted_score_index = sorted_score_index[:keep_top_k]
selected_indices = {}
for _, c, _ in sorted_score_index:
selected_indices[c] = []
for s, c, idx in sorted_score_index:
selected_indices[c].append(idx)
if not shared:
for labels in selected_indices:
selected_indices[labels].sort()
num_det = keep_top_k
return selected_indices, num_det
def batched_multiclass_nms(
boxes,
scores,
background,
score_threshold,
nms_threshold,
nms_top_k,
keep_top_k,
normalized=True,
gpu_logic=False,
):
batch_size = scores.shape[0]
num_boxes = scores.shape[2]
det_outs = []
index_outs = []
lod = []
for n in range(batch_size):
nmsed_outs, nmsed_num = multiclass_nms(
boxes[n],
scores[n],
background,
score_threshold,
nms_threshold,
nms_top_k,
keep_top_k,
normalized,
shared=True,
)
lod.append(nmsed_num)
if nmsed_num == 0:
continue
tmp_det_out = []
for c, indices in nmsed_outs.items():
for idx in indices:
xmin, ymin, xmax, ymax = boxes[n][idx][:]
tmp_det_out.append(
[
c,
scores[n][c][idx],
xmin,
ymin,
xmax,
ymax,
idx + n * num_boxes,
]
)
if gpu_logic:
sorted_det_out = sorted(
tmp_det_out, key=lambda tup: tup[1], reverse=True
)
else:
sorted_det_out = sorted(
tmp_det_out, key=lambda tup: tup[0], reverse=False
)
det_outs.extend(sorted_det_out)
return det_outs, lod
class TestIOU(unittest.TestCase):
def test_iou(self):
box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
expt_output = np.array([2.0 / 16.0]).astype('float32')
calc_output = np.array([iou(box1, box2, True)]).astype('float32')
np.testing.assert_allclose(calc_output, expt_output, rtol=1e-05)
class XPUTestMulticlassNMS3Op(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'multiclass_nms3'
self.use_dynamic_create_class = False
class TestXpuMulticlassNMS3Op(XPUOpTest):
def set_argument(self):
self.score_threshold = 0.01
def setUp(self):
self.op_type = "multiclass_nms3"
self.dtype = self.in_type
self.set_argument()
N = 7
M = 1200
C = 21
BOX_SIZE = 4
background = 0
nms_threshold = 0.3
nms_top_k = 400
keep_top_k = (
200 if not hasattr(self, 'keep_top_k') else self.keep_top_k
)
score_threshold = self.score_threshold
scores = np.random.random((N * M, C)).astype(self.dtype)
scores = np.apply_along_axis(softmax, 1, scores)
scores = np.reshape(scores, (N, M, C))
scores = np.transpose(scores, (0, 2, 1))
boxes = np.random.random((N, M, BOX_SIZE)).astype(self.dtype)
boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5
boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
det_outs, lod = batched_multiclass_nms(
boxes,
scores,
background,
score_threshold,
nms_threshold,
nms_top_k,
keep_top_k,
gpu_logic=self.gpu_logic
if hasattr(self, 'gpu_logic')
else None,
)
det_outs = np.array(det_outs)
nmsed_outs = (
det_outs[:, :-1].astype(self.dtype)
if len(det_outs)
else np.array([], dtype=np.float32).reshape([0, BOX_SIZE + 2])
)
index_outs = (
det_outs[:, -1:].astype('int')
if len(det_outs)
else np.array([], dtype='int').reshape([0, 1])
)
self.inputs = {'BBoxes': boxes, 'Scores': scores}
self.outputs = {
'Out': nmsed_outs,
'Index': index_outs,
'NmsRoisNum': np.array(lod).astype('int32'),
}
self.attrs = {
'background_label': 0,
'nms_threshold': nms_threshold,
'nms_top_k': nms_top_k,
'keep_top_k': keep_top_k,
'score_threshold': score_threshold,
'nms_eta': 1.0,
'normalized': True,
}
def test_check_output(self):
self.check_output_with_place(paddle.XPUPlace(0))
support_types = get_xpu_op_support_types('multiclass_nms3')
for stype in support_types:
create_test_class(globals(), XPUTestMulticlassNMS3Op, stype)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from get_test_cover_info import (
XPUOpTestWrapper,
create_test_class,
get_xpu_op_support_types,
)
from op_test_xpu import XPUOpTest
import paddle
paddle.enable_static()
def l2_norm(x, axis, epsilon):
x2 = x**2
s = np.sum(x2, axis=axis, keepdims=True)
r = np.sqrt(s + epsilon)
y = x / np.broadcast_to(r, x.shape)
return y, r
class XPUTestNormOp(XPUOpTestWrapper):
def __init__(self):
self.op_name = "norm"
self.use_dynamic_create_class = False
class TestXPUNormOp(XPUOpTest):
def setUp(self):
self.op_type = "norm"
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
self.init_test_case()
x = np.random.random(self.shape).astype(self.dtype)
y, norm = l2_norm(x, self.axis, self.epsilon)
self.inputs = {'X': x}
self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
self.outputs = {'Out': y, 'Norm': norm}
def init_test_case(self):
self.shape = [2, 3, 4, 5]
self.axis = 1
self.epsilon = 1e-8
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad(self):
self.check_grad_with_place(self.place, ['X'], 'Out')
class TestXPUNormOp2(TestXPUNormOp):
def init_test_case(self):
self.shape = [5, 3, 9, 7]
self.axis = 0
self.epsilon = 1e-8
class TestXPUNormOp3(TestXPUNormOp):
def init_test_case(self):
self.shape = [5, 3, 2, 7]
self.axis = -1
self.epsilon = 1e-8
class TestXPUNormOp4(TestXPUNormOp):
def init_test_case(self):
self.shape = [128, 1024, 14, 14]
self.axis = 2
self.epsilon = 1e-8
class TestXPUNormOp5(TestXPUNormOp):
def init_test_case(self):
self.shape = [2048, 2048]
self.axis = 1
self.epsilon = 1e-8
support_types = get_xpu_op_support_types('norm')
for stype in support_types:
create_test_class(globals(), XPUTestNormOp, stype)
if __name__ == "__main__":
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册