diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 42d749b7b8e3e4cfb2cd0f222321f76ae08c2e3c..85b163ecffbe167fdfeee70c483883421e80a0f9 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -149,12 +149,20 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); auto* dout = ctx.Input(framework::GradVarName("Out")); + VLOG(4) << "element sub: dx " << dx << " dy " << dy << " dout " << dout; // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance + bool swap_x_y = false; if (x->numel() < y->numel()) { std::swap(x, y); std::swap(dx, dy); + swap_x_y = true; + } + + std::vector scales{1.0}; + if (swap_x_y) { + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; } int axis = ctx.Attr("axis"); @@ -172,7 +180,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (dx) { std::shared_ptr dst_memory; @@ -181,8 +188,11 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { dst_memory = reorder_handler.AcquireDstMemory( dx, dout->mem_desc(), ctx.GetPlace()); - auto reorder_p = - reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + + dnnl::primitive_attr reorder_attr; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = reorder_handler.AcquireReorder( + dst_memory, reorder_src_memory_p, reorder_attr); platform::RecordEvent record_reorder( "int_reorder", platform::TracerEventType::UserDefined, @@ -190,6 +200,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div platform::BinaryMKLDNNHandler binary_handler(BINARY_OP, axis, @@ -233,11 +244,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; - std::vector scales(1); - scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, reorder_attr); platform::RecordEvent record_reorder( "int_reorder", platform::TracerEventType::UserDefined, @@ -331,7 +341,8 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // Broadcasting if (BINARY_OP == dnnl::algorithm::binary_sub) { dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + po.append_eltwise( + 1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0); broadcast_reduction_attr.set_post_ops(po); } diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 66376dd883543168bfaf82d29773e8a29524cd33..f08a60aa02a66c61a5b98def65c92b53f57403f4 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -840,6 +840,10 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); if (x->numel() < y->numel()) { + if (algo == dnnl::algorithm::binary_sub) { + attributes = CreateAttributes( + algo, -1.0 * scale_x, -1.0 * scale_y, scale_out, post_ops); + } this->AcquireForwardPrimitiveDescriptor( attributes, algo, src1_md, src0_md, dst_md); } else { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py index e70cc8e3779672ebdd25bc6d3e76eb5f2eefbc4c..6029c7773303080a0d163d38785da818c256e998 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py @@ -20,6 +20,8 @@ from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_flo from paddle.fluid.framework import _current_expected_place import paddle.fluid.core as core +import sys + @OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), "GPU is not supported") @@ -108,6 +110,23 @@ class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp): self.axis = 1 +class TestMKLDNNElementwiseSubOp40(TestMKLDNNElementwiseSubOp): + + def init_input_output(self): + self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): def init_input_output(self):