From e92e6b06972d97e68a5c4ab2cccb8d6bfe7e42f5 Mon Sep 17 00:00:00 2001 From: piotrekobiIntel Date: Wed, 27 Oct 2021 10:10:28 +0200 Subject: [PATCH] Added fp32 / bf16 forward and backward elementwise_div_mkldnn operator (#36158) * Add WIP version of elementwise_div_mkldnn without working dy grad * Add dy gradient calculation implementation, disable broadcast tests * Readd removed tests from static_mode_white_list * Add bfloat16 gradient tests, remove int8 and uint8 support * - Change the way dy grad is calculated to improve performance - Refactor BinaryMKLDNNHandler to use a default parameter * Change copyright year * Refactor as suggested * Attempt to bypass CI Approval not accepting max_relative_error * Fix formatting issue --- .../mkldnn/elementwise_div_mkldnn_op.cc | 147 ++++++++++++++ paddle/fluid/platform/mkldnn_reuse.h | 6 +- .../mkldnn/test_elementwise_div_mkldnn_op.py | 179 ++++++++++++++++++ tools/static_mode_white_list.py | 1 + 4 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc new file mode 100644 index 00000000000..c037daba0ee --- /dev/null +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + // dx = dout / y + + platform::BinaryMKLDNNHandler handler( + dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), + dout, y, dx, 1.0f, 1.0f, 1.0f); + + const auto src_dout_memory = handler.AcquireSrcMemory(dout); + const auto src_y_memory = handler.AcquireSecondSrcMemory(y); + const auto dst_dx_memory = handler.AcquireDstMemory(dx); + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_dx_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); + } + + if (dy) { + // dy = -dout * out / y + + platform::BinaryMKLDNNHandler y_handler( + dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, + y, nullptr, 1.0f, 1.0f, 1.0f); + + const auto y_memory = y_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); + + platform::BinaryMKLDNNHandler handler( + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + const auto src_dout_memory = handler.AcquireSrcMemory(dout); + const auto src_out_memory = handler.AcquireSecondSrcMemory(out); + + // If broadcasting is in use then let's write to temporary + // buffer allocated by oneDNN + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? handler.AcquireDstMemory(dy) + : handler.AcquireDstMemory(); + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_out_memory}, + {DNNL_ARG_DST, *dst_dy_memory}, + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + dy->set_layout(framework::DataLayout::kMKLDNN); + + // Reduction is needed for broadcasting scenario + if (dout->dims() != dy->dims()) { + platform::ReductionMKLDNNHandler handler_sum( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); + auto dy_memory_p = handler_sum.AcquireDstMemory(dy); + auto reduction_p = handler_sum.AcquireForwardPrimitive(); + + // As source we use mem object with results from binary operation + reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, + {DNNL_ARG_DST, *dy_memory_p}}); + astream.wait(); + dy->set_format( + platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( + framework::vectorize(dy->dims())))); + + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +// TODO(piotrekobi) add int8, uint8 support +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseDivMKLDNNGradKernel, + ops::EltwiseDivMKLDNNGradKernel) diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 084b47bb3c7..2ab2de1c1f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -614,7 +614,8 @@ class BinaryMKLDNNHandler BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z) + float scale_x, float scale_y, float scale_z, + const dnnl::post_ops& post_ops = dnnl::post_ops()) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -663,10 +664,11 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + attributes.set_post_ops(post_ops); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); } - std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py new file mode 100644 index 00000000000..a3c41d2f034 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_div_mkldnn_op.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +from paddle import enable_static +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 +from paddle.fluid.framework import _current_expected_place +import paddle.fluid.core as core + + +@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), + "GPU is not supported") +class TestMKLDNNElementwiseDivOp(OpTest): + def setUp(self): + self.op_type = "elementwise_div" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', None, 0.005, False, 0.02) + + def test_check_grad_ignore_x(self): + self.check_grad(['Y'], 'Out', set("X"), 0.005, False, 0.02) + + def test_check_grad_ignore_y(self): + self.check_grad(['X'], 'Out', set('Y'), 0.005, False, 0.02) + + def init_axis(self): + self.axis = -1 + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output() + + +class TestMKLDNNElementwiseDivOp2(TestMKLDNNElementwiseDivOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [100]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + +class TestMKLDNNElementwiseDivOp3(TestMKLDNNElementwiseDivOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + +class TestMKLDNNElementwiseDivOp4(TestMKLDNNElementwiseDivOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + # TODO(piotrekobiIntel): Enable when grad is ready + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_x(self): + pass + + +class TestMKLDNNElementwiseDivOp5(TestMKLDNNElementwiseDivOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + # TODO(piotrekobiIntel): Enable when grad is ready + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_x(self): + pass + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestBf16(TestMKLDNNElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.x_bf16 = convert_float_to_uint16(self.x) + self.y_bf16 = convert_float_to_uint16(self.y) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def init_dtype(self): + self.dtype = np.float32 + self.mkldnn_data_type = "bfloat16" + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [100]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype) + self.out = np.divide(self.x, self.y) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[ + np.divide(self.x, self.y), np.divide( + (np.multiply(-self.x, self.x)), np.multiply(self.y, self.y)) + ], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + user_defined_grads=[ + np.divide((np.multiply(-self.x, self.y)), + np.multiply(self.y, self.y)) + ], + user_defined_grad_outputs=[self.y_bf16]) + + def test_check_grad_ignore_y(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[np.divide(self.x, self.y)], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestBf16Broadcasting(TestBf16): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_x(self): + pass + + +if __name__ == '__main__': + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 7d0a2a8953f..8705e29cbb2 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -610,6 +610,7 @@ STATIC_MODE_TESTING_LIST = [ 'test_dequantize_mkldnn_op', 'test_elementwise_add_mkldnn_op', 'test_elementwise_add_bf16_mkldnn_op', + 'test_elementwise_div_mkldnn_op', 'test_elementwise_sub_mkldnn_op', 'test_elementwise_mul_mkldnn_op', 'test_elementwise_mul_bf16_mkldnn_op', -- GitLab