Unverified commit 48abaec6, authored by jakpiase, committed by GitHub

Performance fix for recommender model (#43803)

* fix for binary kernels

* fixed performance for elementwise, reduce and concat

* added comment

* CI fix

* CI fix

* added formatting

* reverted one file

* Revert "reverted one file"

This reverts commit 54725e1c62318d3a18913821200e973816751019.

* Revert "added formatting"

This reverts commit b9795dd253d755a329376d7ab0542860aa7815c6.

* added enforcing oneDNN BF16 reduce kernel

* fix for eltwise and reenabled reshape kernels

* fix for binary handler

* added formatting

* reverted changes for flatten, squeeze and reshape ops
Parent commit: f39183ea
@@ -75,8 +75,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();

-    const auto* x = ctx.Input<Tensor>("X");
-    const auto* y = ctx.Input<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");

     float scale_x = ctx.Attr<float>("Scale_x");
@@ -96,6 +96,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
                                      scale_o,
                                      get_post_ops(ctx));

+    // oneDNN's binary primitive is optimized for broadcasting y into x, so in
+    // the other case we have to swap the tensors to achieve optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+    }
+
     const auto src_x_memory = handler.AcquireSrcMemory(x);
     const auto src_y_memory = handler.AcquireSecondSrcMemory(y);

     // (jczaja) For Inplace src and dst should be the same memory object.
@@ -159,6 +165,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));

+    // oneDNN's binary primitive is optimized for broadcasting y into x, so in
+    // the other case we have to swap the tensors to achieve optimal performance
+    if (x->numel() < y->numel()) {
+      std::swap(x, y);
+      std::swap(dx, dy);
+    }
+
     int axis = ctx.Attr<int>("axis");

     auto tz = phi::vectorize<int64_t>(dout->dims());
......
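The forward and backward elementwise changes above hinge on one rule: oneDNN's binary primitive is fastest when the second source is the one being broadcast, so whichever input has fewer elements should end up in the y slot. Below is a minimal, self-contained sketch of that ordering rule; `FakeTensor` and `OrderForOneDNNBinary` are illustrative stand-ins, not Paddle APIs. In the gradient kernel, dx and dy are swapped along with x and y so the pairing between each input and its gradient is preserved.

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Illustrative stand-in for framework::Tensor -- only what the sketch needs.
struct FakeTensor {
  std::vector<int64_t> dims;
  int64_t numel() const {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                           std::multiplies<int64_t>());
  }
};

// Reorder the operands so the larger (non-broadcast) tensor is always passed
// as the first source of the oneDNN binary primitive, mirroring the
// std::swap in the kernel above.
void OrderForOneDNNBinary(const FakeTensor*& x, const FakeTensor*& y) {
  if (x->numel() < y->numel()) {
    std::swap(x, y);
  }
}

int main() {
  FakeTensor small{{2, 50, 1}}, big{{2, 50, 160}};
  const FakeTensor* x = &small;
  const FakeTensor* y = &big;
  OrderForOneDNNBinary(x, y);
  // x now points at the 2x50x160 tensor; y at the broadcast 2x50x1 tensor.
  return 0;
}
```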
@@ -77,7 +77,24 @@ class ConcatMKLDNNHandler
     }

     auto dst_dims = phi::vectorize<int64_t>(output->dims());

-    auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    dnnl::memory::desc dst_md;
+
+    // If concat is being used as a stack op (all source memories' dims on the
+    // concat_axis are equal to 1), the primitive may choose a non-optimal
+    // memory format tag for the destination, because concat picks it based on
+    // the source memory descriptors: e.g. 200x1x10 can be described as both
+    // abc and bac, and both describe the exact same physical layout, but in
+    // that scenario bac will be chosen for the destination no matter which
+    // formats are set on the inputs. In that scenario we enforce a dense
+    // format, because it is the most common one and should give the best
+    // performance.
+    if (dst_dims[concat_axis] == static_cast<int64_t>(srcs_md.size())) {
+      dst_md = memory::desc(
+          dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size()));
+    } else {
+      dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any);
+    }
+
     this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md);
   }
......
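The key observation in the concat change is the condition used to detect the stack-like case: when every source has extent 1 along the concat axis, the destination's extent along that axis equals the number of sources. The snippet below is a small sketch of that check in isolation; `ConcatActsAsStack` is a hypothetical helper, not part of Paddle. When the check holds, the handler forces a plain (dense) destination format instead of letting oneDNN pick one from the ambiguous source descriptors.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper: returns true when a concat behaves like a stack, i.e.
// the destination extent on the concat axis equals the number of sources
// (which is what happens when every source has extent 1 on that axis).
bool ConcatActsAsStack(const std::vector<int64_t>& dst_dims, int concat_axis,
                       size_t num_srcs) {
  return dst_dims[concat_axis] == static_cast<int64_t>(num_srcs);
}

int main() {
  // Stacking four 200x1x10 sources along axis 1 gives a 200x4x10 destination,
  // so the dense format is enforced.
  assert(ConcatActsAsStack({200, 4, 10}, 1, 4));
  // Concatenating 200x3x10 and 200x5x10 gives 200x8x10 from two sources, so
  // the destination format stays "any" and oneDNN chooses it.
  assert(!ConcatActsAsStack({200, 8, 10}, 1, 2));
  return 0;
}
```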
@@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel {
       }
     }
   }

+  // oneDNN's reduction kernel is optimized only for reducing over the
+  // outermost dims, so for any other type of reduction it is better to fall
+  // back to the native implementation
+  static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) {
+    // native reduce kernels don't support bf16,
+    // so the oneDNN kernel is enforced in that case
+    if (ctx.Input<framework::LoDTensor>("X")->dtype() ==
+        experimental::DataType::BFLOAT16)
+      return true;
+
+    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
+    const bool reduce_all = ctx.Attr<bool>("reduce_all");
+    int ndims = ctx.Input<framework::LoDTensor>("X")->dims().size();
+
+    if (reduce_all) {
+      return true;
+    }
+
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i];
+    }
+    sort(reduce_dims.begin(), reduce_dims.end());
+
+    for (size_t i = 0; i < reduce_dims.size(); ++i) {
+      if (reduce_dims[reduce_dims.size() - i - 1] !=
+          static_cast<int>(ndims - i - 1)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     // choose cudnn kernel if the runtime supported.
@@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel {
       return framework::OpKernelType(input_data_type, ctx.GetPlace());

 #ifdef PADDLE_WITH_MKLDNN
-    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
+        HasOptimizedOneDNNKernel(ctx)) {
       return framework::OpKernelType(input_data_type,
                                      ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
......
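HasOptimizedOneDNNKernel boils down to an axis check: after converting negative axes to their positive equivalents and sorting, the reduced axes must be exactly the trailing dimensions ndims-1, ndims-2, ..., with reduce_all and BF16 inputs short-circuiting to true. The following is a stripped-down sketch of just that check, without the ExecutionContext plumbing; `ReducesTrailingDims` is an illustrative name, not a Paddle function.

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Accept a reduction only if the (normalized, sorted) reduce axes are a
// contiguous run ending at the last dimension, e.g. {ndims-2, ndims-1}.
bool ReducesTrailingDims(std::vector<int> reduce_dims, int ndims) {
  for (size_t i = 0; i < reduce_dims.size(); ++i) {
    if (reduce_dims[i] < 0) reduce_dims[i] += ndims;  // e.g. -1 -> ndims - 1
  }
  std::sort(reduce_dims.begin(), reduce_dims.end());
  for (size_t i = 0; i < reduce_dims.size(); ++i) {
    if (reduce_dims[reduce_dims.size() - i - 1] !=
        ndims - static_cast<int>(i) - 1) {
      return false;
    }
  }
  return true;
}

int main() {
  assert(ReducesTrailingDims({-1}, 4));     // last dim of a 4-D tensor
  assert(ReducesTrailingDims({2, 3}, 4));   // contiguous trailing dims
  assert(!ReducesTrailingDims({0}, 4));     // leading dim -> fall back
  assert(!ReducesTrailingDims({1, 3}, 4));  // non-contiguous -> fall back
  return 0;
}
```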
@@ -690,9 +690,14 @@ class BinaryMKLDNNHandler
     auto attributes =
         CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);

-    this->AcquireForwardPrimitiveDescriptor(
-        attributes, algo, src0_md, src1_md, dst_md);
-  }
+    if (x->numel() < y->numel()) {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src1_md, src0_md, dst_md);
+    } else {
+      this->AcquireForwardPrimitiveDescriptor(
+          attributes, algo, src0_md, src1_md, dst_md);
+    }
+  }

   std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
......
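The handler change mirrors the kernel-side swap: when x has fewer elements than y, the primitive descriptor is built with the two source descriptors exchanged, so the broadcast operand always ends up as the second source. For context, here is a rough sketch of the underlying oneDNN usage this is optimizing for, assuming the oneDNN 2.x C++ API (dnnl.hpp) that Paddle was built against around this commit; the shapes are taken from the new TestMKLDNNElementwiseAddOpBroadcastXintoY test, everything else is illustrative.

```cpp
#include <vector>
#include "dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream s(eng);

  // src0 is the full-size tensor; src1 has extent 1 on the last axis and is
  // broadcast into src0 -- the layout oneDNN's binary primitive is tuned for.
  memory::dims x_dims = {2, 50, 160};
  memory::dims y_dims = {2, 50, 1};
  auto f32 = memory::data_type::f32;
  auto x_md = memory::desc(x_dims, f32, memory::format_tag::abc);
  auto y_md = memory::desc(y_dims, f32, memory::format_tag::abc);
  auto dst_md = memory::desc(x_dims, f32, memory::format_tag::abc);

  auto binary_d = binary::desc(algorithm::binary_add, x_md, y_md, dst_md);
  auto binary_pd = binary::primitive_desc(binary_d, eng);
  auto binary_prim = binary(binary_pd);

  std::vector<float> x_data(2 * 50 * 160, 1.0f);
  std::vector<float> y_data(2 * 50, 2.0f);
  std::vector<float> dst_data(2 * 50 * 160, 0.0f);
  memory x_mem(x_md, eng, x_data.data());
  memory y_mem(y_md, eng, y_data.data());
  memory dst_mem(dst_md, eng, dst_data.data());

  binary_prim.execute(s, {{DNNL_ARG_SRC_0, x_mem},
                          {DNNL_ARG_SRC_1, y_mem},
                          {DNNL_ARG_DST, dst_mem}});
  s.wait();  // dst now holds x + broadcast(y), i.e. 3.0f everywhere
  return 0;
}
```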
@@ -68,6 +68,14 @@ class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
         self.out = np.add(self.x, self.y)


+class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp):
+
+    def init_input_output(self):
+        self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype)
+        self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+
 class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):

     def init_input_output(self):
......