未验证 提交 7ab0e336 编写于 作者: C Chenxiao Niu 提交者: GitHub

[MLU] transpose avg_pool2d to NHWC for better performance. (#44475)

上级 c3ba8056
...@@ -100,6 +100,25 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -100,6 +100,25 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
cnnlPoolingMode_t pool_mode = cnnlPoolingMode_t pool_mode =
ToCnnlPoolingMode(pooling_type, exclusive, adaptive); ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
// transpose NCHW to NHWC since cnnl pool2d has worse performance in the
// NCHW layout.
framework::Tensor trans_in_x;
framework::Tensor trans_out;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
trans_out = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
{out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
if (!adaptive) { if (!adaptive) {
MLUCnnlPoolingDesc pool_desc(pool_mode, MLUCnnlPoolingDesc pool_desc(pool_mode,
CNNL_NOT_PROPAGATE_NAN, CNNL_NOT_PROPAGATE_NAN,
...@@ -128,8 +147,8 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -128,8 +147,8 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
{static_cast<int64_t>(extra_input_size)}, cpu_ctx); {static_cast<int64_t>(extra_input_size)}, cpu_ctx);
cnnlInitPoolingExtraInput(handle, cnnlInitPoolingExtraInput(handle,
pool_desc.get(), pool_desc.get(),
in_x_desc.get(), trans_in_x_desc.get(),
out_desc.get(), trans_out_desc.get(),
GetBasePtr(&extra_host_tensor)); GetBasePtr(&extra_host_tensor));
framework::Tensor extra_device_tensor = framework::Tensor extra_device_tensor =
ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>( ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
...@@ -151,12 +170,12 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -151,12 +170,12 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
out_w, out_w,
pool_desc.get(), pool_desc.get(),
nullptr /*alpha*/, nullptr /*alpha*/,
in_x_desc.get(), trans_in_x_desc.get(),
GetBasePtr(in_x), GetBasePtr(&trans_in_x),
nullptr /*beta*/, nullptr /*beta*/,
GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/,
out_desc.get(), trans_out_desc.get(),
GetBasePtr(out)); GetBasePtr(&trans_out));
} else { } else {
MLUCnnl::PoolingForward(ctx, MLUCnnl::PoolingForward(ctx,
pool_mode, pool_mode,
...@@ -164,31 +183,14 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -164,31 +183,14 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
out_w, out_w,
pool_desc.get(), pool_desc.get(),
nullptr /*alpha*/, nullptr /*alpha*/,
in_x_desc.get(), trans_in_x_desc.get(),
GetBasePtr(in_x), GetBasePtr(&trans_in_x),
nullptr /*beta*/, nullptr /*beta*/,
nullptr /*params_shape_ptr*/, nullptr /*params_shape_ptr*/,
out_desc.get(), trans_out_desc.get(),
GetBasePtr(out)); GetBasePtr(&trans_out));
} }
} else { } else {
// cnnl adaptive pooling only supports NHWC layout
framework::Tensor trans_in_x;
framework::Tensor trans_out;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
trans_out = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
{out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnl::AdaptivePoolingForward(ctx, MLUCnnl::AdaptivePoolingForward(ctx,
pool_mode, pool_mode,
trans_in_x_desc.get(), trans_in_x_desc.get(),
...@@ -197,11 +199,11 @@ class MLUPoolOpKernel : public framework::OpKernel<T> { ...@@ -197,11 +199,11 @@ class MLUPoolOpKernel : public framework::OpKernel<T> {
GetBasePtr(&trans_out), GetBasePtr(&trans_out),
nullptr, nullptr,
nullptr); nullptr);
if (!channel_last) { }
std::vector<int> perm{0, 3, 1, 2}; if (!channel_last) {
TransposeFromMLUTensor<T>( std::vector<int> perm{0, 3, 1, 2};
ctx, perm, &trans_out, out, false /*need_reshape_or_alloc*/); TransposeFromMLUTensor<T>(
} ctx, perm, &trans_out, out, false /*need_reshape_or_alloc*/);
} }
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册