diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 4e25947d7d5a6c659d5e359b60618351ddff4eeb..72f8c9c71cf8253fa808ef2d6090206cc0c3778b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -95,7 +95,13 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");

 #ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
+    // If broadcasting is needed, use native implementation
+    auto CanMKLDNNElementwiseAddBeUsed = [&]() {
+      return ctx.Input<Tensor>("X")->dims() == ctx.Input<Tensor>("Y")->dims();
+    };
+
+    if (platform::CanMKLDNNBeUsed(ctx) &&
+        (ctx.Type() != "elementwise_add" || CanMKLDNNElementwiseAddBeUsed())) {
       return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
                                      framework::LibraryType::kMKLDNN);
@@ -227,7 +233,16 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
         ctx, framework::GradVarName("Out"));

 #ifdef PADDLE_WITH_MKLDNN
-    if (platform::CanMKLDNNBeUsed(ctx)) {
+    // If broadcasting is needed, use native implementation
+    auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
+      auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+      auto dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+      return (dx != nullptr && dy != nullptr && dx->dims() == dy->dims());
+    };
+
+    if (platform::CanMKLDNNBeUsed(ctx) &&
+        (ctx.Type() != "elementwise_add_grad" ||
+         CanMKLDNNElementwiseAddGradBeUsed())) {
       return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                      framework::DataLayout::kMKLDNN,
                                      framework::LibraryType::kMKLDNN);
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index 3490353c7558d6b793548773cac39048379155f0..72d2855ad46b2c44959596d0c03e974fe32f87cf 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -41,136 +41,58 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
     auto* x = ctx.Input<Tensor>("X");
     auto* y = ctx.Input<Tensor>("Y");
     auto* z = ctx.Output<Tensor>("Out");
-    const T* x_data = x->data<T>();
-    const T* y_data = y->data<T>();
-
-    int axis = ctx.Attr<int>("axis");
-
-    auto x_dims = x->dims();
-    auto y_dims_untrimed = y->dims();
-    auto z_dims = z->dims();
-
-    mkldnn::stream astream(mkldnn_engine);
-
-    // Execute default elementwise_add operator when
-    // broadcast operations need to performed.
-    if (x_dims != y_dims_untrimed) {
-      Tensor _x;
-      MKLDNNMemoryFormat format;
-      auto src_x_tz = framework::vectorize(x_dims);
-
-      if ((src_x_tz.size() == 3 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncw)) ||
-          (src_x_tz.size() == 4 &&
-           x->format() != (format = MKLDNNMemoryFormat::nchw)) ||
-          (src_x_tz.size() == 5 &&
-           x->format() != (format = MKLDNNMemoryFormat::ncdhw))) {
-        _x.Resize(x_dims);
-
-        mkldnn::memory::data_type in_type = platform::MKLDNNGetDataType<T>();
-        auto out_format = platform::MKLDNNFormatForSize(
-            x_dims.size(), MKLDNNMemoryFormat::nchw);
-
-        const std::string key =
-            platform::CreateKey(src_x_tz, x->format(), out_format, in_type);
-
-        platform::ReorderMKLDNNHandler handler(src_x_tz, x->type(), in_type,
-                                               dev_ctx, mkldnn_engine, key);
-
-        auto user_x_memory_p = handler.AcquireSrcMemory(
-            x->format(), paddle::platform::to_void_cast<T>(x_data));
-
-        auto x_memory_p =
-            handler.AcquireDstMemory(&_x, out_format, ctx.GetPlace());
-
-        auto x_reorder = handler.AcquireReorder(x_memory_p, user_x_memory_p);
-
-        x_reorder->execute(astream, *user_x_memory_p, *x_memory_p);
-        astream.wait();
-      } else {
-        format = x->format();
-        _x.ShareDataWith(*x);
-      }
-
-      z->mutable_data<T>(ctx.GetPlace());
-      auto sum_func = [](T a, T b) -> T { return a + b; };
-      TransformFunctor<decltype(sum_func), T,
-                       paddle::platform::CPUDeviceContext, T>
-          functor(
-              &_x, y, z,
-              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
-              sum_func);
+    PADDLE_ENFORCE_EQ(
+        x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for X tensor"));
+    PADDLE_ENFORCE_NE(
+        x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for X tensor"));

-      axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
-      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                     "Axis should be in range [0, x_dims)");
+    PADDLE_ENFORCE_EQ(
+        y->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for Y tensor"));
+    PADDLE_ENFORCE_NE(
+        y->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for Y tensor"));

-      auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
-      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
-
-      int pre, n, post, is_run_common_broadcast;
-      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post,
-                   &is_run_common_broadcast);
-
-      if (post == 1) {
-        functor.RunRowWise(n, pre);
-      } else {
-        functor.RunMidWise(n, pre, post);
-      }
-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(format);
-    } else {
-      PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for X tensor");
-      PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef,
-                        "Wrong format set for X tensor");
-
-      PADDLE_ENFORCE_EQ(y->layout(), DataLayout::kMKLDNN,
-                        "Wrong layout set for Y tensor");
-      PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef,
-                        "Wrong format set for Y tensor");
-
-      auto src_x_tz = framework::vectorize(x_dims);
-      auto src_y_tz = framework::vectorize(y_dims_untrimed);
-      auto dst_tz = framework::vectorize(z_dims);
-
-      std::vector<float> scales = {1.0f, 1.0f};
-
-      const std::string key =
-          platform::CreateKey(src_x_tz, ctx.OutputName("Out"));
-
-      platform::SumMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
-
-      auto src_x_memory = handler.AcquireSrcMemory(
-          {{src_x_tz}, platform::MKLDNNGetDataType<T>(), x->format()},
-          paddle::platform::to_void_cast<T>(x_data));
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();

-      auto src_y_memory = handler.AcquireSecondSrcMemory(
-          {{src_y_tz}, platform::MKLDNNGetDataType<T>(), y->format()},
-          paddle::platform::to_void_cast<T>(y_data));
+    auto src_x_tz = framework::vectorize(x->dims());
+    auto src_y_tz = framework::vectorize(y->dims());
+    auto dst_tz = framework::vectorize(z->dims());

-      auto dst_md = memory::desc({dst_tz}, platform::MKLDNNGetDataType<T>(),
-                                 MKLDNNMemoryFormat::any);
-
-      auto sum_pd = handler.AcquireSumPrimitiveDescriptor(
-          {src_x_memory, src_y_memory}, scales, dst_md);
+    std::vector<float> scales = {1.0f, 1.0f};

-      T* z_data =
-          z->mutable_data<T>(ctx.GetPlace(), sum_pd->dst_desc().get_size());
+    const std::string key =
+        platform::CreateKey(src_x_tz, ctx.OutputName("Out"));

-      auto dst_memory = handler.AcquireDstMemoryFromPrimitive(z_data);
+    platform::SumMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);

-      auto sum_prim = handler.AcquireSum();
+    auto src_x_memory = handler.AcquireSrcMemory(
+        {{src_x_tz}, platform::MKLDNNGetDataType<T>(), x->format()},
+        paddle::platform::to_void_cast<T>(x_data));
+    auto src_y_memory = handler.AcquireSecondSrcMemory(
+        {{src_y_tz}, platform::MKLDNNGetDataType<T>(), y->format()},
+        paddle::platform::to_void_cast<T>(y_data));
+    auto dst_md = memory::desc({dst_tz}, platform::MKLDNNGetDataType<T>(),
+                               MKLDNNMemoryFormat::any);
+    auto sum_pd = handler.AcquireSumPrimitiveDescriptor(
+        {src_x_memory, src_y_memory}, scales, dst_md);
+    T* z_data =
+        z->mutable_data<T>(ctx.GetPlace(), sum_pd->dst_desc().get_size());
+    auto dst_memory = handler.AcquireDstMemoryFromPrimitive(z_data);
+    auto sum_prim = handler.AcquireSum();

-      sum_prim->execute(astream, {{MKLDNN_ARG_MULTIPLE_SRC, *src_x_memory},
-                                  {MKLDNN_ARG_MULTIPLE_SRC + 1, *src_y_memory},
-                                  {MKLDNN_ARG_DST, *dst_memory}});
-      astream.wait();
+    mkldnn::stream astream(mkldnn_engine);
+    sum_prim->execute(astream, {{MKLDNN_ARG_MULTIPLE_SRC, *src_x_memory},
+                                {MKLDNN_ARG_MULTIPLE_SRC + 1, *src_y_memory},
+                                {MKLDNN_ARG_DST, *dst_memory}});
+    astream.wait();

-      z->set_layout(DataLayout::kMKLDNN);
-      z->set_format(platform::GetMKLDNNFormat(*dst_memory));
-    }
+    z->set_layout(DataLayout::kMKLDNN);
+    z->set_format(platform::GetMKLDNNFormat(*dst_memory));
   }
 };

@@ -184,40 +106,23 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    // skip out, x, y,
-    // dout length is larger or equal than dx, dy.
-    auto* out = dout;
-    auto *x = dout, *y = dout;

     auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
       in->set_layout(DataLayout::kMKLDNN);
       in->set_format(out->format());
     };

-    if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) {
-      if (dx->dims() == dy->dims()) {
-        auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
-        if (dx) {
-          blas.VCOPY(dout->numel(), dout->data<T>(),
-                     dx->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dx, dout);
-        }
-
-        if (dy) {
-          blas.VCOPY(dout->numel(), dout->data<T>(),
-                     dy->mutable_data<T>(ctx.GetPlace()));
-          set_mkldnn_format(dy, dout);
-        }
-      }
-    } else {
-      // Execute default kernel when broadcast is needed
-      x = ctx.Input<Tensor>("X");
-      y = ctx.Input<Tensor>("Y");
-      ElemwiseExplicitGradCompute<paddle::platform::CPUDeviceContext, T,
-                                  IdentityGrad<T>, IdentityGrad<T>>(
-          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-          IdentityGrad<T>());
+    auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+    if (dx) {
+      blas.VCOPY(dout->numel(), dout->data<T>(),
+                 dx->mutable_data<T>(ctx.GetPlace()));
+      set_mkldnn_format(dx, dout);
+    }
+
+    if (dy) {
+      blas.VCOPY(dout->numel(), dout->data<T>(),
+                 dy->mutable_data<T>(ctx.GetPlace()));
+      set_mkldnn_format(dy, dout);
     }
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
index e2e932a8a5c06dcf3843ad87bd88e87cac662bd0..f98336bb2c9b8af2500fafe9a620919e16a7bbdf 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py
@@ -15,121 +15,38 @@ from __future__ import print_function

 import unittest
 import numpy as np
-import paddle.fluid.core as core
-from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
 from paddle.fluid.tests.unittests.test_elementwise_add_op import *

 '''
-Some tests differ from the tests defined in test_elementwise_add_op.py
-because MKLDNN does not support tensors of number of dimensions 3.
+MKLDNN does not support tensors whose number of dimensions equals 3.
 Such dimensions cause exceptions in MKLDNN reorder primitive.
+The DNNL-based kernel is used only when broadcasting is not required
+(see the GetExpectedKernelType() methods in elementwise_op.h).
 '''


 class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
-    def init_input_output(self):
-        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
-        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
-        self.out = np.add(self.x, self.y)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-@skip_check_grad_ci(
-    reason="[skip shape check] Use y_shape(1) to test broadcast.")
-class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-@skip_check_grad_ci(
-    reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
-class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
-        self.y = np.random.rand(1, 1).astype(self.dtype)
-        self.out = self.x + self.y
+    def init_data_format(self):
+        self.data_format = 'MKLDNN'

     def init_kernel_type(self):
         self.use_mkldnn = True

-
-class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+    def init_dtype(self):
+        self.dtype = np.float32


-class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
+class TestMKLDNNElementwiseAddOp2(TestMKLDNNElementwiseAddOp):
     def init_input_output(self):
-        self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(100).astype(self.dtype)
-        self.out = self.x + self.y.reshape(100, 1, 1, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 100, 3, 4).astype(self.dtype)
-        self.y = np.random.rand(100).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 100, 1, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 2, 3, 100).astype(self.dtype)
-        self.y = np.random.rand(100).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 1, 1, 100)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_rowwise_add_0(
-        TestElementwiseAddOp_rowwise_add_0):
-    def init_input_output(self):
-        self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
-        self.y = np.random.rand(10, 12).astype(self.dtype)
-        self.out = self.x + self.y.reshape(1, 10, 12, 1)
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNElementwiseAddOp_rowwise_add_1(
-        TestElementwiseAddOp_rowwise_add_1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+        self.x = np.random.random((100, )).astype(self.dtype)
+        self.y = np.random.random((100, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)


-class TestMKLDNNElementwiseAddOp_channelwise_add(
-        TestElementwiseAddOp_channelwise_add):
+class TestMKLDNNElementwiseAddOp3(TestMKLDNNElementwiseAddOp):
     def init_input_output(self):
-        self.x = np.random.rand(100, 2, 3, 3).astype(self.dtype)
-        self.y = np.random.rand(100, 1, 1, 1).astype(self.dtype)
-        self.out = self.x + self.y
-
-    def init_kernel_type(self):
-        self.use_mkldnn = True
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 73857b9e6d7a340c975004763640ce7061c64ebd..db42ff5e58bb07027bcf9f7785cd64ae50318d36 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -27,8 +27,6 @@ class TestElementwiseAddOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_add"
-        self.dtype = np.float64
-        self.axis = -1
         self.init_dtype()
         self.init_input_output()
         self.init_kernel_type()
@@ -78,10 +76,10 @@ class TestElementwiseAddOp(OpTest):
         self.out = np.add(self.x, self.y)

     def init_dtype(self):
-        pass
+        self.dtype = np.float64

     def init_axis(self):
-        pass
+        self.axis = -1


 @unittest.skipIf(not core.is_compiled_with_cuda(),
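Reviewer note (not part of the patch): a minimal NumPy sketch of the behaviour the retained tests exercise after this change. The function and variable names below are illustrative only; the forward path mirrors the DNNL sum primitive for same-shape inputs, and the gradient path mirrors the VCOPY calls in EltwiseAddMKLDNNGradKernel, which copy dout into dx and dy when no broadcasting is involved.

import numpy as np

def mkldnn_add_forward(x, y):
    # The DNNL-based kernel is taken only when X and Y have identical dims;
    # any broadcasting case falls back to the native elementwise_add kernel.
    assert x.shape == y.shape, "broadcasting would use the native kernel"
    return x + y

def mkldnn_add_grad(dout):
    # dX and dY are plain copies of dOut when the shapes of X and Y match.
    return dout.copy(), dout.copy()

x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32)
y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32)
out = mkldnn_add_forward(x, y)
dx, dy = mkldnn_add_grad(np.ones_like(out))

This is also why the broadcast-oriented test classes (scalar, rowwise, channelwise) are dropped from test_elementwise_add_mkldnn_op.py: those shapes never reach the MKLDNN kernel under the new GetExpectedKernelType() checks.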