Unverified commit 7e4290c5, authored by hong19860320, committed via GitHub

[XPU] Fix yolo_box to support multi-stream based inference (#55310)

Parent: 76b77d81
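
This commit makes two changes, shown in the diffs below. In the yolo_box XPU kernel, the fp16-to-fp32 cast buffers were previously DenseTensor temporaries allocated through the device context; they are now scratch buffers taken from an xpu::ctx_guard (RAII_GUARD.alloc_l3_or_gm), so each temporary is owned by the kernel's own XPU context and released when the guard leaves scope, which keeps the op correct under multi-stream inference. In the conv2d_transpose XPU kernels, the defensive copy DenseTensor filter_ = filter; is dropped and the const filter tensor is used directly, since it is only read.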
@@ -41,17 +41,13 @@ void Conv2dTransposeXPUKernel(const Context& ctx,
                                DenseTensor* out_max) {
   using XPUT = typename XPUTypeTrait<T>::Type;
-  // The filter will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-  DenseTensor filter_ = filter;
   ctx.template Alloc<T>(out);
   ctx.template Alloc<float>(out_max);
   bool is_nchw;
   is_nchw = (data_format == "NHWC") ? false : true;
   DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size());  // hw
-  DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
+  DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size());
   std::vector<int> ksize = vectorize<int>(filter_data_dims);
   std::vector<int> paddings_ = paddings;
   std::vector<int> dilations_ = dilations;
@@ -78,7 +74,7 @@ void Conv2dTransposeXPUKernel(const Context& ctx,
   int r = xpu::conv2d_transpose_fusion_v2<XPUT, int16_t, XPUT, int16_t>(
       ctx.x_context(),
       reinterpret_cast<const XPUT*>(x.data<T>()),
-      filter_.data<int16_t>(),
+      filter.data<int16_t>(),
       reinterpret_cast<XPUT*>(out->data<T>()),
       batch_size,
       img_yc,
...
@@ -38,36 +38,33 @@ void YoloBoxXPUKernel(const Context& ctx,
   const float* stride_data;
   const float* anchor_grid_data;
   // fix precision of fp16 model
+  xpu::ctx_guard RAII_GUARD(ctx.x_context());
   if (std::is_same<T, phi::dtype::float16>::value) {
-    DenseTensor grid_data_fp32_t;
-    DenseTensor stride_data_fp32_t;
-    DenseTensor anchor_grid_data_fp32_t;
-    ctx.template Alloc<float>(&grid_data_fp32_t, grid.numel() * sizeof(float));
-    int r1 = xpu::cast<XPUType, float>(
+    float* grid_data_temp = RAII_GUARD.alloc_l3_or_gm<float>(grid.numel());
+    int r = xpu::cast<XPUType, float>(
         ctx.x_context(),
         reinterpret_cast<const XPUType*>(grid.data<T>()),
-        grid_data_fp32_t.data<float>(),
+        grid_data_temp,
         grid.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r1, "cast");
-    ctx.template Alloc<float>(&stride_data_fp32_t,
-                              stride.numel() * sizeof(float));
-    int r2 = xpu::cast<XPUType, float>(
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    float* stride_data_temp = RAII_GUARD.alloc_l3_or_gm<float>(stride.numel());
+    r = xpu::cast<XPUType, float>(
         ctx.x_context(),
         reinterpret_cast<const XPUType*>(stride.data<T>()),
-        stride_data_fp32_t.data<float>(),
+        stride_data_temp,
         stride.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r2, "cast");
-    ctx.template Alloc<float>(&anchor_grid_data_fp32_t,
-                              anchor_grid.numel() * sizeof(float));
-    int r3 = xpu::cast<XPUType, float>(
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    float* anchor_grid_data_temp =
+        RAII_GUARD.alloc_l3_or_gm<float>(anchor_grid.numel());
+    r = xpu::cast<XPUType, float>(
         ctx.x_context(),
         reinterpret_cast<const XPUType*>(anchor_grid.data<T>()),
-        anchor_grid_data_fp32_t.data<float>(),
+        anchor_grid_data_temp,
         anchor_grid.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r3, "cast");
-    grid_data = grid_data_fp32_t.data<float>();
-    stride_data = stride_data_fp32_t.data<float>();
-    anchor_grid_data = anchor_grid_data_fp32_t.data<float>();
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    grid_data = grid_data_temp;
+    stride_data = stride_data_temp;
+    anchor_grid_data = anchor_grid_data_temp;
   } else {
     grid_data = grid.data<float>();
     stride_data = stride.data<float>();
...
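
For reference, here is a minimal sketch of the RAII scratch-buffer pattern the hunk above adopts, reduced to a single cast. The kernel context ctx, the XPUType alias, and the grid input come from the surrounding kernel; grid_fp32 is a name chosen for this sketch.

  // The guard releases every alloc_l3_or_gm() allocation when it leaves scope.
  xpu::ctx_guard RAII_GUARD(ctx.x_context());
  // alloc_l3_or_gm<float>(n) returns an n-element scratch buffer, preferring
  // fast on-chip L3 memory and falling back to global memory.
  float* grid_fp32 = RAII_GUARD.alloc_l3_or_gm<float>(grid.numel());
  // Cast the fp16 tensor to fp32 into the scratch buffer.
  int r = xpu::cast<XPUType, float>(
      ctx.x_context(),
      reinterpret_cast<const XPUType*>(grid.data<T>()),
      grid_fp32,
      grid.numel());
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
  // grid_fp32 stays valid until RAII_GUARD is destroyed at kernel exit.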
@@ -53,11 +53,6 @@ void Conv2dTransposeKernel(const Context& ctx,
                             DenseTensor* out) {
   using XPUT = typename XPUTypeTrait<T>::Type;
-  // The filter will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-  DenseTensor filter_ = filter;
   ctx.template Alloc<T>(out);
   PADDLE_ENFORCE_EQ(
@@ -67,7 +62,7 @@ void Conv2dTransposeKernel(const Context& ctx,
           ("XPU do support data_format is NCHW in conv_transpose op.")));
   DDim in_data_dims = slice_ddim(x.dims(), 2, x.dims().size());
-  DDim filter_data_dims = slice_ddim(filter_.dims(), 2, filter_.dims().size());
+  DDim filter_data_dims = slice_ddim(filter.dims(), 2, filter.dims().size());
   std::vector<int> ksize = vectorize<int>(filter_data_dims);
   std::vector<int> paddings_ = paddings;
@@ -86,7 +81,7 @@ void Conv2dTransposeKernel(const Context& ctx,
     int r = xpu::conv2d_transpose_v2<float, float, float, int32_t>(
         ctx.x_context(),
         x.data<float>(),
-        filter_.data<float>(),
+        filter.data<float>(),
         out->data<float>(),
         batch_size,
         img_yc,
@@ -107,7 +102,7 @@ void Conv2dTransposeKernel(const Context& ctx,
     int r = xpu::conv2d_transpose_v2<float, float, float, float>(
         ctx.x_context(),
         x.data<float>(),
-        filter_.data<float>(),
+        filter.data<float>(),
         out->data<float>(),
         batch_size,
         img_yc,
@@ -132,7 +127,7 @@ void Conv2dTransposeKernel(const Context& ctx,
     int r = xpu::conv2d_transpose_v2<float, float, float, int32_t>(
         ctx.x_context(),
         x.data<float>(),
-        filter_.data<float>(),
+        filter.data<float>(),
         out->data<float>(),
         batch_size,
         img_yc,
@@ -157,7 +152,7 @@ void Conv2dTransposeKernel(const Context& ctx,
     int r = xpu::conv2d_transpose<float, float, float, int_with_ll_t>(
         ctx.x_context(),
         x.data<float>(),
-        filter_.data<float>(),
+        filter.data<float>(),
         out->data<float>(),
         batch_size,
         img_yc,
...
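
The comment deleted in the hunks above documents why the copy once existed: the filter used to be reshaped during the computation, so the kernel copied it to avoid mutating the variable held in the Scope. The current code paths only call filter.dims() and filter.data<...>(), i.e. read-only access to the const reference, so the copy can be removed without changing behavior.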