Unverified commit bed4fb27 authored by Z zhulei, committed by GitHub

[NPU] Add density_prior_box (#36361)

* [NPU] Add density_prior_box op

* [NPU] Add density_prior_box op
Parent 5d18967b
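Note: a density prior box layer emits, at every feature-map cell, one box per fixed ratio for each point of a density x density grid. The total per-cell count mirrors the kernel's num_priors_per_ratio accumulation; the one-liner below is illustrative, not part of the patch:

    num_priors = len(fixed_ratios) * sum(d * d for d in densities)

With the test defaults densities = [4, 2, 1] and fixed_ratios = [1.0], that is 21 priors per cell.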
......
@@ -17,14 +17,15 @@ endfunction()
 if (WITH_ASCEND_CL)
   detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc)
+  detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc)
 else()
   detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+  detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
 endif()
 detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
-detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)
 detection_library(anchor_generator_op SRCS anchor_generator_op.cc
                   anchor_generator_op.cu)
 detection_library(target_assign_op SRCS target_assign_op.cc
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using fp16 = paddle::platform::float16;
template <typename T>
struct DensityPriorBoxFunction {
public:
explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx)
: ctx(ctx) {
place = ctx.GetPlace();
stream = ctx.template device_context<platform::NPUDeviceContext>().stream();
t0.mutable_data<float>({1}, place);
t1.mutable_data<float>({1}, place);
tn.mutable_data<float>({1}, place);
FillNpuTensorWithConstant<float>(&t0, static_cast<float>(0));
FillNpuTensorWithConstant<float>(&t1, static_cast<float>(1));
}
void Arange(int n, Tensor* x) {
// x should be init first
FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {});
runner.Run(stream);
}
void Add(const Tensor* x, const Tensor* y, Tensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Cast(const Tensor* x, Tensor* y) {
auto dst_dtype = ConvertToNpuDtype(y->type());
const auto& runner = NpuOpRunner(
"Cast", {*x}, {*y}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner.Run(stream);
}
void Sub(const Tensor* x, const Tensor* y, Tensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Mul(const Tensor* x, const Tensor* y, Tensor* z) {
    // z should be init first
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Adds(const Tensor* x, float scalar, Tensor* y) {
// y should be init first
const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
runner.Run(stream);
}
void Muls(const Tensor* x, float scalar, Tensor* y) {
// y should be init first
const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
runner.Run(stream);
}
void Maximum(const Tensor* x, const Tensor* y, Tensor* z) {
    // z should be init first
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Minimum(const Tensor* x, const Tensor* y, Tensor* z) {
    // z should be init first
const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Concat(const std::vector<Tensor>& inputs, int axis, Tensor* output) {
// output should be init first
std::vector<std::string> names;
for (size_t i = 0; i < inputs.size(); i++) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{inputs},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
void Tile(const Tensor* x, Tensor* y, const std::vector<int>& multiples) {
// y should be init first
if (x->dims() == y->dims()) {
framework::TensorCopy(
*x, place, ctx.template device_context<platform::NPUDeviceContext>(),
y);
return;
}
const auto& runner =
NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}});
runner.Run(stream);
}
void FloatVec2Tsr(const std::vector<float>& vec, Tensor* tsr_dst) {
    // copy a host float vector into a device tensor of type T, then sync
framework::TensorFromVector<T>(vec, ctx.device_context(), tsr_dst);
ctx.template device_context<platform::NPUDeviceContext>().Wait();
}
private:
platform::Place place;
aclrtStream stream;
const framework::ExecutionContext& ctx;
Tensor t0;
Tensor t1;
Tensor tn;
};
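// The fp16 specializations below generate values in fp32 first and then
// Cast to fp16, rather than writing half precision directly.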
template <>
void DensityPriorBoxFunction<fp16>::Arange(int n, Tensor* x) {
Tensor x_fp32(framework::proto::VarType::FP32);
x_fp32.mutable_data<float>(x->dims(), place);
FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {});
runner.Run(stream);
Cast(&x_fp32, x);
}
template <>
void DensityPriorBoxFunction<fp16>::FloatVec2Tsr(const std::vector<float>& vec,
Tensor* tsr_dst) {
Tensor tsr_fp32(framework::proto::VarType::FP32);
tsr_fp32.mutable_data<float>(tsr_dst->dims(), place);
framework::TensorFromVector<float>(vec, ctx.device_context(), &tsr_fp32);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
Cast(&tsr_fp32, tsr_dst);
}
template <typename T>
class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* image = ctx.Input<paddle::framework::Tensor>("Image");
auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto clip = ctx.Attr<bool>("clip");
auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
auto densities = ctx.Attr<std::vector<int>>("densities");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
int image_w = image->dims()[3];
int image_h = image->dims()[2];
int layer_w = input->dims()[3];
int layer_h = input->dims()[2];
auto _type = input->type();
auto place = ctx.GetPlace();
DensityPriorBoxFunction<T> F(ctx);
Tensor h(_type);
h.mutable_data<T>({layer_h}, place);
Tensor w(_type);
w.mutable_data<T>({layer_w}, place);
F.Arange(layer_h, &h);
F.Arange(layer_w, &w);
h.Resize({layer_h, 1, 1, 1});
w.Resize({1, layer_w, 1, 1});
step_w = step_w > 0 ? step_w : static_cast<float>(image_w) / layer_w;
step_h = step_h > 0 ? step_h : static_cast<float>(image_h) / layer_h;
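    // step_average is the integer mean of the two strides and serves as the
    // base cell size when laying out the grid of shifted box centers.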
int step_average = static_cast<int>((step_w + step_h) * 0.5);
int ratios_size = fixed_ratios.size();
int num_priors_per_ratio = 0;
for (size_t i = 0; i < densities.size(); ++i) {
num_priors_per_ratio += densities[i] * densities[i];
}
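    // In total, each feature-map location emits
    // ratios_size * num_priors_per_ratio prior boxes.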
Tensor di(_type);
Tensor dj(_type);
Tensor shifts(_type);
Tensor box_w_ratio(_type);
Tensor box_h_ratio(_type);
di.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
dj.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
shifts.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
box_w_ratio.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
box_h_ratio.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
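    // Fill the flat per-prior buffers segment by segment: for each density d,
    // the next ratios_size * d * d entries describe one d x d grid of
    // shifted centers per fixed ratio.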
int64_t start = 0;
std::vector<int> vec_tile = {0, 0, 0};
for (size_t i = 0; i < densities.size(); ++i) {
// Range = start:start+ratios_size*density_sqr, density = densities[i]
int density_sqr = densities[i] * densities[i];
// shifts[Range] = [step_average/density]*ratios_size*density_sqr
Tensor shifts_part =
shifts.Slice(start, start + ratios_size * density_sqr);
FillNpuTensorWithConstant<T>(&shifts_part,
static_cast<T>(step_average / densities[i]));
// di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size
// dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size
Tensor di_part = di.Slice(start, start + ratios_size * density_sqr);
Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr);
if (densities[i] > 1) {
di_part.Resize({ratios_size, densities[i], densities[i]});
dj_part.Resize({ratios_size, densities[i], densities[i]});
Tensor range_n(_type);
range_n.mutable_data<T>({densities[i]}, place);
F.Arange(densities[i], &range_n);
range_n.Resize({1, densities[i], 1});
vec_tile[0] = ratios_size;
vec_tile[1] = 1;
vec_tile[2] = densities[i];
F.Tile(&range_n, &di_part, vec_tile);
range_n.Resize({1, 1, densities[i]});
vec_tile[1] = densities[i];
vec_tile[2] = 1;
F.Tile(&range_n, &dj_part, vec_tile);
} else {
FillNpuTensorWithConstant<T>(&di_part, static_cast<T>(0));
FillNpuTensorWithConstant<T>(&dj_part, static_cast<T>(0));
}
int start_box_ratio = start;
for (float ar : fixed_ratios) {
// Range_mini = start_box_ratio:start_box_ratio+density_sqr
      // box_w_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr
      // box_h_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr
Tensor box_h_ratio_part =
box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
Tensor box_w_ratio_part =
box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
FillNpuTensorWithConstant<T>(&box_w_ratio_part,
static_cast<T>(fixed_sizes[i] * sqrt(ar)));
FillNpuTensorWithConstant<T>(&box_h_ratio_part,
static_cast<T>(fixed_sizes[i] / sqrt(ar)));
start_box_ratio += density_sqr;
}
start = start_box_ratio;
}
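    // Reshape the flat per-prior buffers to [1, 1, priors, 1] so they
    // broadcast against w ([1, layer_w, 1, 1]) and h ([layer_h, 1, 1, 1]).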
di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
// c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts
// c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts
Tensor c_x(_type);
Tensor c_y(_type);
auto dim0 = framework::make_ddim(
{1, layer_w, ratios_size * num_priors_per_ratio, 1});
auto dim1 = framework::make_ddim(
{layer_h, 1, ratios_size * num_priors_per_ratio, 1});
c_x.mutable_data<T>(dim0, place);
c_y.mutable_data<T>(dim1, place);
F.Adds(&w, offset, &w);
F.Muls(&w, step_w, &w);
F.Adds(&w, static_cast<float>(-step_average) * static_cast<float>(0.5), &w);
F.Adds(&h, offset, &h);
F.Muls(&h, step_h, &h);
F.Adds(&h, static_cast<float>(-step_average) * static_cast<float>(0.5), &h);
F.Mul(&di, &shifts, &di);
F.Mul(&dj, &shifts, &dj);
F.Muls(&shifts, static_cast<float>(0.5), &shifts);
F.Add(&di, &shifts, &di);
F.Add(&dj, &shifts, &dj);
F.Add(&dj, &w, &c_x);
F.Add(&di, &h, &c_y);
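    // c_x ([1, layer_w, priors, 1]) and c_y ([layer_h, 1, priors, 1]) now
    // hold the box center coordinates for every (cell, prior) pair.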
// box_w_ratio = box_w_ratio / 2
// box_h_ratio = box_h_ratio / 2
F.Muls(&box_w_ratio, static_cast<float>(0.5), &box_w_ratio);
F.Muls(&box_h_ratio, static_cast<float>(0.5), &box_h_ratio);
Tensor zero_t(_type);
Tensor one_t(_type);
zero_t.mutable_data<T>({1}, place);
one_t.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&zero_t, static_cast<T>(0));
FillNpuTensorWithConstant<T>(&one_t, static_cast<T>(1));
Tensor outbox0(_type);
Tensor outbox1(_type);
Tensor outbox2(_type);
Tensor outbox3(_type);
outbox0.mutable_data<T>(dim0, place);
outbox1.mutable_data<T>(dim1, place);
outbox2.mutable_data<T>(dim0, place);
outbox3.mutable_data<T>(dim1, place);
// outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 )
// outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 )
// outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 )
// outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 )
F.Sub(&c_x, &box_w_ratio, &outbox0);
F.Sub(&c_y, &box_h_ratio, &outbox1);
F.Add(&c_x, &box_w_ratio, &outbox2);
F.Add(&c_y, &box_h_ratio, &outbox3);
F.Muls(&outbox0, static_cast<float>(1.0 / image_w), &outbox0);
F.Muls(&outbox1, static_cast<float>(1.0 / image_h), &outbox1);
F.Muls(&outbox2, static_cast<float>(1.0 / image_w), &outbox2);
F.Muls(&outbox3, static_cast<float>(1.0 / image_h), &outbox3);
F.Maximum(&outbox0, &zero_t, &outbox0);
F.Maximum(&outbox1, &zero_t, &outbox1);
F.Minimum(&outbox2, &one_t, &outbox2);
F.Minimum(&outbox3, &one_t, &outbox3);
if (clip) {
// outbox0 = min ( outbox0, 1 )
// outbox1 = min ( outbox1, 1 )
// outbox2 = max ( outbox2, 0 )
// outbox3 = max ( outbox3, 0 )
F.Minimum(&outbox0, &one_t, &outbox0);
F.Minimum(&outbox1, &one_t, &outbox1);
F.Maximum(&outbox2, &zero_t, &outbox2);
F.Maximum(&outbox3, &zero_t, &outbox3);
}
auto out_dim = framework::make_ddim(
{layer_h, layer_w, ratios_size * num_priors_per_ratio, 4});
boxes->mutable_data<T>(place);
vars->mutable_data<T>(place);
Tensor boxes_share(_type);
Tensor vars_share(_type);
boxes_share.ShareDataWith(*boxes);
boxes_share.Resize(out_dim);
vars_share.ShareDataWith(*vars);
vars_share.Resize(out_dim);
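    // boxes_share / vars_share alias the op outputs, so the Concat and Tile
    // below write the final results in place.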
Tensor box0(_type);
Tensor box1(_type);
Tensor box2(_type);
Tensor box3(_type);
// out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1}
out_dim[3] = 1;
box0.mutable_data<T>(out_dim, place);
box1.mutable_data<T>(out_dim, place);
box2.mutable_data<T>(out_dim, place);
box3.mutable_data<T>(out_dim, place);
std::vector<int> vec_exp_out02 = {layer_h, 1, 1, 1};
std::vector<int> vec_exp_out13 = {1, layer_w, 1, 1};
F.Tile(&outbox0, &box0, vec_exp_out02);
F.Tile(&outbox1, &box1, vec_exp_out13);
F.Tile(&outbox2, &box2, vec_exp_out02);
F.Tile(&outbox3, &box3, vec_exp_out13);
F.Concat({box0, box1, box2, box3}, 3, &boxes_share);
std::vector<int> multiples = {layer_h, layer_w,
ratios_size * num_priors_per_ratio, 1};
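    // Every box shares the same four variances, so the 4-vector is simply
    // tiled across the whole [layer_h, layer_w, priors] grid.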
Tensor variances_t(_type);
// variances.size() == 4
variances_t.mutable_data<T>({4}, place);
F.FloatVec2Tsr(variances, &variances_t);
F.Tile(&variances_t, &vars_share, multiples);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(density_prior_box,
ops::DensityPriorBoxOpNPUKernel<plat::float16>,
ops::DensityPriorBoxOpNPUKernel<float>);
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
import math
import paddle
from op_test import OpTest
paddle.enable_static()
np.random.seed(2021)
class TestNpuDensityPriorBoxOp(OpTest):
def set_data(self):
self.init_test_params()
self.init_test_input()
self.init_test_output()
#self.init_test_output2()
self.inputs = {'Input': self.input, 'Image': self.image}
self.attrs = {
'variances': self.variances,
'clip': self.clip,
'step_w': self.step_w,
'step_h': self.step_h,
'offset': self.offset,
'densities': self.densities,
'fixed_sizes': self.fixed_sizes,
'fixed_ratios': self.fixed_ratios,
'flatten_to_2d': self.flatten_to_2d
}
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def test_check_output(self):
self.check_output_with_place(self.place, atol=self.atol)
def setUp(self):
self.__class__.use_npu = True
self.op_type = 'density_prior_box'
self.place = paddle.NPUPlace(0)
self.init_dtype()
self.set_data()
def init_dtype(self):
self.dtype = np.float32
def set_density(self):
self.densities = [4, 2, 1]
self.fixed_sizes = [32.0, 64.0, 128.0]
self.fixed_ratios = [1.0]
self.layer_w = 17
self.layer_h = 17
self.image_w = 533
self.image_h = 533
self.flatten_to_2d = False
def init_test_params(self):
self.set_density()
self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h)
self.input_channels = 2
self.image_channels = 3
self.batch_size = 10
self.variances = [0.1, 0.1, 0.2, 0.2]
        self.variances = np.array(self.variances, dtype=np.float64).flatten()
self.clip = True
self.num_priors = 0
if len(self.fixed_sizes) > 0 and len(self.densities) > 0:
for density in self.densities:
if len(self.fixed_ratios) > 0:
                    self.num_priors += len(self.fixed_ratios) * (pow(density, 2))
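        # num_priors = len(fixed_ratios) * sum(d * d for d in densities),
        # matching num_priors_per_ratio in the C++ kernel.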
self.offset = 0.5
self.atol = 1e-5
def init_test_input(self):
self.image = np.random.random(
(self.batch_size, self.image_channels, self.image_h,
self.image_w)).astype(self.dtype)
self.input = np.random.random(
(self.batch_size, self.input_channels, self.layer_h,
self.layer_w)).astype(self.dtype)
def init_test_output(self):
out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
out_boxes = np.zeros(out_dim).astype(self.dtype)
out_var = np.zeros(out_dim).astype(self.dtype)
step_average = int((self.step_w + self.step_h) * 0.5)
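        # Same integer mean of the two strides as the C++ kernel's step_average.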
for h in range(self.layer_h):
for w in range(self.layer_w):
idx = 0
c_x = (w + self.offset) * self.step_w
c_y = (h + self.offset) * self.step_h
# Generate density prior boxes with fixed size
for density, fixed_size in zip(self.densities,
self.fixed_sizes):
if (len(self.fixed_ratios) > 0):
for ar in self.fixed_ratios:
shift = int(step_average / density)
box_width_ratio = fixed_size * math.sqrt(ar)
box_height_ratio = fixed_size / math.sqrt(ar)
for di in range(density):
for dj in range(density):
c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift
c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift
out_boxes[h, w, idx, :] = [
max((c_x_temp - box_width_ratio / 2.0) /
self.image_w, 0),
max((c_y_temp - box_height_ratio / 2.0)
/ self.image_h, 0),
min((c_x_temp + box_width_ratio / 2.0) /
self.image_w, 1),
min((c_y_temp + box_height_ratio / 2.0)
/ self.image_h, 1)
]
idx += 1
if self.clip:
out_boxes = np.clip(out_boxes, 0.0, 1.0)
out_var = np.tile(self.variances,
(self.layer_h, self.layer_w, self.num_priors, 1))
self.out_boxes = out_boxes.astype(self.dtype)
self.out_var = out_var.astype(self.dtype)
if self.flatten_to_2d:
self.out_boxes = self.out_boxes.reshape((-1, 4))
self.out_var = self.out_var.reshape((-1, 4))
class TestNpuDensityPriorBoxFlatten(TestNpuDensityPriorBoxOp):
def set_density(self):
self.densities = [3, 4]
self.fixed_sizes = [1.0, 2.0]
self.fixed_ratios = [1.0]
self.layer_w = 32
self.layer_h = 32
self.image_w = 40
self.image_h = 40
self.flatten_to_2d = True
class TestNpuDensityPriorBoxOp1(TestNpuDensityPriorBoxOp):
def set_density(self):
super(TestNpuDensityPriorBoxOp1, self).set_density()
self.layer_w = 1
self.layer_h = 1
class TestNpuDensityPriorBoxOp2(TestNpuDensityPriorBoxOp):
def set_density(self):
super(TestNpuDensityPriorBoxOp2, self).set_density()
self.layer_w = 15
self.layer_h = 17
self.image_w = 533
self.image_h = 532
class TestNpuDensityPriorBoxOp3(TestNpuDensityPriorBoxOp):
def set_density(self):
super(TestNpuDensityPriorBoxOp3, self).set_density()
self.fixed_ratios = [1.0, 4.0]
class TestNpuDensityPriorBoxOpFP16(TestNpuDensityPriorBoxOp):
def init_dtype(self):
self.dtype = np.float16
def init_test_params(self):
super(TestNpuDensityPriorBoxOpFP16, self).init_test_params()
self.atol = 1e-3
self.clip = False
if __name__ == '__main__':
unittest.main()
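For context, a minimal static-graph usage sketch of the Python API this kernel backs. It assumes the existing paddle.fluid.layers.density_prior_box signature; shapes follow the test defaults above, and the snippet is illustrative rather than part of the patch:

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    # NCHW feature map and image, matching the test defaults.
    feat = paddle.static.data(name='feat', shape=[-1, 2, 17, 17], dtype='float32')
    image = paddle.static.data(name='image', shape=[-1, 3, 533, 533], dtype='float32')
    # boxes and variances both come back as [17, 17, 21, 4].
    boxes, variances = fluid.layers.density_prior_box(
        input=feat,
        image=image,
        densities=[4, 2, 1],
        fixed_sizes=[32.0, 64.0, 128.0],
        fixed_ratios=[1.0],
        variance=[0.1, 0.1, 0.2, 0.2],
        clip=True,
        offset=0.5)

Running such a program on paddle.NPUPlace(0) would exercise the kernel registered above.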