未验证 提交 7e4290c5 作者: hong19860320 提交者: GitHub

[XPU] Fix yolo_box to support multi-stream based inference (#55310)

上级 76b77d81
......@@ -41,17 +41,13 @@ void Conv2dTransposeXPUKernel(const Context& ctx,
DenseTensor* out_max) {
using XPUT = typename XPUTypeTrait<T>::Type;
// The filter will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
DenseTensor filter_ = filter;
ctx.template Alloc<T>(out);
ctx.template Alloc<float>(out_max);
bool is_nchw;
is_nchw = (data_format == "NHWC") ? false : true;
DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size()); // hw
DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
std::vector<int> paddings_ = paddings;
std::vector<int> dilations_ = dilations;
......@@ -78,7 +74,7 @@ void Conv2dTransposeXPUKernel(const Context& ctx,
int r = xpu::conv2d_transpose_fusion_v2<XPUT, int16_t, XPUT, int16_t>(
ctx.x_context(),
reinterpret_cast<const XPUT*>(x.data<T>()),
filter_.data<int16_t>(),
filter.data<int16_t>(),
reinterpret_cast<XPUT*>(out->data<T>()),
batch_size,
img_yc,
......
......@@ -38,36 +38,33 @@ void YoloBoxXPUKernel(const Context& ctx,
const float* stride_data;
const float* anchor_grid_data;
// fix precision of fp16 model
xpu::ctx_guard RAII_GUARD(ctx.x_context());
if (std::is_same<T, phi::dtype::float16>::value) {
DenseTensor grid_data_fp32_t;
DenseTensor stride_data_fp32_t;
DenseTensor anchor_grid_data_fp32_t;
ctx.template Alloc<float>(&grid_data_fp32_t, grid.numel() * sizeof(float));
int r1 = xpu::cast<XPUType, float>(
float* grid_data_temp = RAII_GUARD.alloc_l3_or_gm<float>(grid.numel());
int r = xpu::cast<XPUType, float>(
ctx.x_context(),
reinterpret_cast<const XPUType*>(grid.data<T>()),
grid_data_fp32_t.data<float>(),
grid_data_temp,
grid.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r1, "cast");
ctx.template Alloc<float>(&stride_data_fp32_t,
stride.numel() * sizeof(float));
int r2 = xpu::cast<XPUType, float>(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
float* stride_data_temp = RAII_GUARD.alloc_l3_or_gm<float>(stride.numel());
r = xpu::cast<XPUType, float>(
ctx.x_context(),
reinterpret_cast<const XPUType*>(stride.data<T>()),
stride_data_fp32_t.data<float>(),
stride_data_temp,
stride.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r2, "cast");
ctx.template Alloc<float>(&anchor_grid_data_fp32_t,
anchor_grid.numel() * sizeof(float));
int r3 = xpu::cast<XPUType, float>(
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
float* anchor_grid_data_temp =
RAII_GUARD.alloc_l3_or_gm<float>(anchor_grid.numel());
r = xpu::cast<XPUType, float>(
ctx.x_context(),
reinterpret_cast<const XPUType*>(anchor_grid.data<T>()),
anchor_grid_data_fp32_t.data<float>(),
anchor_grid_data_temp,
anchor_grid.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r3, "cast");
grid_data = grid_data_fp32_t.data<float>();
stride_data = stride_data_fp32_t.data<float>();
anchor_grid_data = anchor_grid_data_fp32_t.data<float>();
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
grid_data = grid_data_temp;
stride_data = stride_data_temp;
anchor_grid_data = anchor_grid_data_temp;
} else {
grid_data = grid.data<float>();
stride_data = stride.data<float>();
......
......@@ -53,11 +53,6 @@ void Conv2dTransposeKernel(const Context& ctx,
DenseTensor* out) {
using XPUT = typename XPUTypeTrait<T>::Type;
// The filter will be reshaped in the calculations,
// so here use an assignment operation,
// that avoids modifying the variable in the Scope.
DenseTensor filter_ = filter;
ctx.template Alloc<T>(out);
PADDLE_ENFORCE_EQ(
......@@ -67,7 +62,7 @@ void Conv2dTransposeKernel(const Context& ctx,
("XPU do support data_format is NCHW in conv_transpose op.")));
DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size());
DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size());
std::vector<int> ksize = vectorize<int>(filter_data_dims);
std::vector<int> paddings_ = paddings;
......@@ -86,7 +81,7 @@ void Conv2dTransposeKernel(const Context& ctx,
int r = xpu::conv2d_transpose_v2<float, float, float, int32_t>(
ctx.x_context(),
x.data<float>(),
filter_.data<float>(),
filter.data<float>(),
out->data<float>(),
batch_size,
img_yc,
......@@ -107,7 +102,7 @@ void Conv2dTransposeKernel(const Context& ctx,
int r = xpu::conv2d_transpose_v2<float, float, float, float>(
ctx.x_context(),
x.data<float>(),
filter_.data<float>(),
filter.data<float>(),
out->data<float>(),
batch_size,
img_yc,
......@@ -132,7 +127,7 @@ void Conv2dTransposeKernel(const Context& ctx,
int r = xpu::conv2d_transpose_v2<float, float, float, int32_t>(
ctx.x_context(),
x.data<float>(),
filter_.data<float>(),
filter.data<float>(),
out->data<float>(),
batch_size,
img_yc,
......@@ -157,7 +152,7 @@ void Conv2dTransposeKernel(const Context& ctx,
int r = xpu::conv2d_transpose<float, float, float, int_with_ll_t>(
ctx.x_context(),
x.data<float>(),
filter_.data<float>(),
filter.data<float>(),
out->data<float>(),
batch_size,
img_yc,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册