diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
index ac0bf15aeb99a3b15080d928d04c58db8795d06d..57996477e38a972b2e2d7472c6ac60173b35a7b5 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
                              dnnl::algorithm::binary_add>,
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_add>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_add>)
-
-REGISTER_OP_KERNEL(
-    elementwise_add_grad,
-    MKLDNN,
-    ::paddle::platform::CPUPlace,
-    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
-                                 dnnl::algorithm::binary_add>,
-    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_add>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
deleted file mode 100644
index d527a078c658c6fc5ef5754a1b176ffbe70dee58..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(elementwise_div,
-                   MKLDNN,
-                   paddle::platform::CPUPlace,
-                   ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_div>,
-                   ops::EltwiseMKLDNNKernel<paddle::platform::bfloat16,
-                                            dnnl::algorithm::binary_div>)
-
-REGISTER_OP_KERNEL(
-    elementwise_div_grad,
-    MKLDNN,
-    paddle::platform::CPUPlace,
-    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
-                                 dnnl::algorithm::binary_div>,
-    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_div>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
index afebc5c6e322a818d743e9a3d0c0e8abeb819b89..ba3a0d87f6cf74156be6491550707d46688f8956 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
                              dnnl::algorithm::binary_mul>,
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_mul>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_mul>)
-
-REGISTER_OP_KERNEL(
-    elementwise_mul_grad,
-    MKLDNN,
-    ::paddle::platform::CPUPlace,
-    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
-                                 dnnl::algorithm::binary_mul>,
-    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_mul>)
diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
index c0eb9b657dbf7880a4072995bd78e36cef386acd..91660b79b09ac3e164c9da676d6a8d10a9695c2a 100644
--- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc
@@ -25,11 +25,3 @@ REGISTER_OP_KERNEL(
                              dnnl::algorithm::binary_sub>,
     ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_sub>,
     ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_sub>)
-
-REGISTER_OP_KERNEL(
-    elementwise_sub_grad,
-    MKLDNN,
-    ::paddle::platform::CPUPlace,
-    ops::EltwiseMKLDNNGradKernel<paddle::platform::bfloat16,
-                                 dnnl::algorithm::binary_sub>,
-    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_sub>)
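For readers tracking the migration mechanics: the fluid registrations removed above are replaced by phi registrations against the OneDNN backend. Schematically (condensed from the hunks in this diff, not new code):

```cpp
// fluid (removed): op name + MKLDNN place, dtype/algorithm in template args
REGISTER_OP_KERNEL(
    elementwise_add_grad,
    MKLDNN,
    ::paddle::platform::CPUPlace,
    ops::EltwiseMKLDNNGradKernel<float, dnnl::algorithm::binary_add>)

// phi (added): kernel name + OneDNN backend, dtypes listed per registration
PD_REGISTER_KERNEL(
    add_grad, OneDNN, ONEDNN, phi::AddGradKernel, float, phi::dtype::bfloat16) {}
```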
diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc
index c6031b34af249c6e054a27d22d0f726ea0ea91cb..c0b99b8ddf03684a433a00d0f69e468b0069ce5e 100644
--- a/paddle/phi/kernels/elementwise_kernel.cc
+++ b/paddle/phi/kernels/elementwise_kernel.cc
@@ -414,3 +414,8 @@ PD_REGISTER_KERNEL(elementwise_pow,
                    float,
                    phi::dtype::float16) {}
 #endif
+
+#if defined PADDLE_WITH_MKLDNN
+PD_REGISTER_KERNEL(
+    divide, OneDNN, ONEDNN, phi::DivideKernel, float, phi::dtype::bfloat16) {}
+#endif
diff --git a/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c65c373dedb1b638a4f3fbc2d3e9d15241f9386
--- /dev/null
+++ b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/elementwise_add_grad_kernel.h"
+#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h"
+#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h"
+
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace funcs {
+
+inline std::vector<int64_t> CalculateBroadcastedDims(
+    const phi::DenseTensor* x, const phi::DenseTensor* y) {
+  const auto src_tz = phi::vectorize(x->dims());
+  const auto dst_tz = phi::vectorize(y->dims());
+
+  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
+
+  if (src_tz.size() == dst_tz.size()) {
+    for (size_t i = 0; i < src_tz.size(); i++) {
+      dst_tz_ex[i] = (src_tz[i] == dst_tz[i]) ? dst_tz[i] : 1;
+    }
+  } else {
+    size_t j = 0;
+    for (size_t i = 0; i < src_tz.size(); i++) {
+      dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
+      if (j == dst_tz.size()) break;
+    }
+  }
+
+  return dst_tz_ex;
+}
+
+inline void AddSubNonBroadcast(ReorderOneDNNHandler* reorder_handler,
+                               phi::DenseTensor* grad_tensor,
+                               const std::shared_ptr<dnnl::memory>& src_memory,
+                               const std::shared_ptr<dnnl::memory>& dst_memory,
+                               const std::vector<float>& scales) {
+  dnnl::primitive_attr reorder_attr;
+  reorder_attr.set_output_scales(0, scales);
+  auto reorder_p =
+      reorder_handler->AcquireReorder(dst_memory, src_memory, reorder_attr);
+
+  paddle::platform::RecordEvent record_reorder(
+      "int_reorder",
+      paddle::platform::TracerEventType::UserDefined,
+      2,
+      paddle::platform::EventRole::kUniqueOp);
+
+  reorder_p->execute(
+      OneDNNContext::tls().get_stream(), *src_memory, *dst_memory);
+}
+
+template <typename T>
+inline void BroadcastReduction(const Place& place,
+                               const dnnl::engine& onednn_engine,
+                               phi::DenseTensor* grad_tensor,
+                               const phi::DenseTensor* dout,
+                               const std::shared_ptr<dnnl::memory>& src_memory,
+                               std::shared_ptr<dnnl::memory> dst_memory,
+                               const std::vector<float>& scales,
+                               const bool is_sub) {
+  dnnl::primitive_attr broadcast_reduction_attr;
+
+  // Broadcasting
+  if (is_sub) {
+    dnnl::post_ops po;
+    po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0);
+    broadcast_reduction_attr.set_post_ops(po);
+  }
+
+  ReductionOneDNNHandler<T> reduction_handler(
+      dnnl::algorithm::reduction_sum,
+      0.0f,
+      0.0f,
+      onednn_engine,
+      place,
+      dout,
+      grad_tensor,
+      CalculateBroadcastedDims(dout, grad_tensor),
+      broadcast_reduction_attr);
+  dst_memory = reduction_handler.AcquireDstMemory(grad_tensor);
+
+  auto reduction_p = reduction_handler.AcquireForwardPrimitive();
+  auto astream = OneDNNContext::tls().get_stream();
+  reduction_p->execute(astream,
+                       {
+                           {DNNL_ARG_SRC, *src_memory},
+                           {DNNL_ARG_DST, *dst_memory},
+                       });
+  astream.wait();
+  grad_tensor->set_mem_desc(dst_memory->get_desc().reshape(
+      phi::vectorize(grad_tensor->dims())));
+}
+
+}  // namespace funcs
+
+template <typename T, dnnl::algorithm BINARY_OP>
+void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& y,
+                           const DenseTensor* out,
+                           const DenseTensor& dout,
+                           int axis,
+                           DenseTensor* dx,
+                           DenseTensor* dy) {
+  const auto& onednn_engine = dev_ctx.GetEngine();
+  // oneDNN's binary primitive is optimized for broadcasting y into x, so in
+  // the other case we have to swap the tensors to achieve optimal performance.
+  bool swap_x_y = false;
+  auto* non_const_x = &x;
+  auto* non_const_y = &y;
+  if (x.numel() < y.numel()) {
+    std::swap(non_const_x, non_const_y);
+    std::swap(dx, dy);
+    swap_x_y = true;
+  }
+
+  std::vector<float> scales{1.0};
+  if (swap_x_y) {
+    scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1;
+  }
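+  // When the inputs were swapped for a subtraction, the gradient computed
+  // below has the wrong sign (d(x - y)/dy == -1), so scales[0] == -1 is
+  // applied afterwards, either through the reorder's output scales
+  // (AddSubNonBroadcast) or an eltwise_linear post-op (BroadcastReduction).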
+
+  auto tz = phi::vectorize(dout.dims());
+
+  funcs::ReorderOneDNNHandler reorder_handler(
+      tz, dout.dtype(), funcs::ToOneDNNDataType(dout.dtype()), onednn_engine);
+
+  auto reorder_src_memory = reorder_handler.AcquireSrcMemory(
+      dout.mem_desc(), funcs::to_void_cast<T>(dout.data<T>()));
+
+  std::shared_ptr<dnnl::memory> dst_memory;
+  std::shared_ptr<dnnl::memory> broadcast_src_memory = reorder_src_memory;
+
+  auto& astream = OneDNNContext::tls().get_stream();
+  if (dx) {
+    // elementwise_add & elementwise_sub
+    if (BINARY_OP == dnnl::algorithm::binary_add ||
+        BINARY_OP == dnnl::algorithm::binary_sub) {
+      if (dout.dims() == dx->dims()) {
+        dst_memory = reorder_handler.AcquireDstMemory(
+            dx, dout.mem_desc(), dev_ctx.GetPlace());
+        funcs::AddSubNonBroadcast(
+            &reorder_handler, dx, reorder_src_memory, dst_memory, scales);
+      }
+    } else {  // elementwise_mul & elementwise_div
+      funcs::BinaryOneDNNHandler<T> binary_handler(BINARY_OP,
+                                                   axis,
+                                                   onednn_engine,
+                                                   dev_ctx.GetPlace(),
+                                                   &dout,
+                                                   non_const_y,
+                                                   dx,
+                                                   1.0f,
+                                                   1.0f,
+                                                   1.0f,
+                                                   false);
+
+      const auto src_dout_memory = binary_handler.AcquireSrcMemory(&dout);
+      const auto src_y_memory =
+          binary_handler.AcquireSecondSrcMemory(non_const_y);
+      dst_memory = binary_handler.AcquireDstMemory(dx);
+
+      const auto binary_prim = binary_handler.AcquireForwardPrimitive();
+
+      const std::unordered_map<int, dnnl::memory> args = {
+          {DNNL_ARG_SRC_0, *src_dout_memory},
+          {DNNL_ARG_SRC_1, *src_y_memory},
+          {DNNL_ARG_DST, *dst_memory}};
+
+      binary_prim->execute(astream, args);
+    }
+    astream.wait();
+
+    if (dout.dims() != dx->dims()) {
+      funcs::BroadcastReduction<T>(dev_ctx.GetPlace(),
+                                   onednn_engine,
+                                   dx,
+                                   &dout,
+                                   broadcast_src_memory,
+                                   dst_memory,
+                                   scales,
+                                   BINARY_OP == dnnl::algorithm::binary_sub);
+    } else {
+      dx->set_mem_desc(dst_memory->get_desc());
+    }
+  }
+
+  if (dy) {
+    // elementwise_add & elementwise_sub
+    if (BINARY_OP == dnnl::algorithm::binary_add ||
+        BINARY_OP == dnnl::algorithm::binary_sub) {
+      if (dout.dims() == dy->dims()) {
+        dst_memory = reorder_handler.AcquireDstMemory(
+            dy, dout.mem_desc(), dev_ctx.GetPlace());
+        funcs::AddSubNonBroadcast(
+            &reorder_handler, dy, reorder_src_memory, dst_memory, scales);
+      }
+    } else {  // elementwise_mul & elementwise_div
+      std::unordered_map<int, dnnl::memory> args;
+      std::shared_ptr<dnnl::binary> binary_prim;
+      std::shared_ptr<dnnl::memory> post_op_memory;
+      std::shared_ptr<dnnl::memory> src_0_memory;
+      std::shared_ptr<dnnl::memory> src_1_memory;
+
+      funcs::BinaryOneDNNHandler<T> binary_handler(dnnl::algorithm::binary_mul,
+                                                   axis,
+                                                   onednn_engine,
+                                                   dev_ctx.GetPlace(),
+                                                   &dout,
+                                                   non_const_x,
+                                                   nullptr,
+                                                   1.0f,
+                                                   1.0f,
+                                                   1.0f,
+                                                   false);
+
+      src_1_memory = binary_handler.AcquireSecondSrcMemory(non_const_x);
+
+      if (BINARY_OP == dnnl::algorithm::binary_div) {
+        funcs::BinaryOneDNNHandler<T> post_op_binary_handler(
+            dnnl::algorithm::binary_div,
+            axis,
+            onednn_engine,
+            dev_ctx.GetPlace(),
+            non_const_y,
+            non_const_y,
+            nullptr,
+            1.0f,
+            1.0f,
+            1.0f,
+            false);
+
+        post_op_memory = post_op_binary_handler.AcquireSrcMemory(non_const_y);
+
+        dnnl::post_ops po;
+        po.append_binary(dnnl::algorithm::binary_div,
+                         post_op_memory->get_desc());
+
+        binary_handler =
+            funcs::BinaryOneDNNHandler<T>(dnnl::algorithm::binary_mul,
+                                          axis,
+                                          onednn_engine,
+                                          dev_ctx.GetPlace(),
+                                          &dout,
+                                          out,
+                                          nullptr,
+                                          -1.0f,
+                                          1.0f,
+                                          1.0f,
+                                          false,
+                                          po);
+
+        src_1_memory = binary_handler.AcquireSecondSrcMemory(out);
+      }
+
+      src_0_memory = binary_handler.AcquireSrcMemory(&dout);
+
+      const auto dst_dy_memory = (dout.dims() == dy->dims())
+                                     ? binary_handler.AcquireDstMemory(dy)
+                                     : binary_handler.AcquireDstMemory();
+
+      binary_prim = binary_handler.AcquireForwardPrimitive();
+      args = {{DNNL_ARG_SRC_0, *src_0_memory},
+              {DNNL_ARG_SRC_1, *src_1_memory},
+              {DNNL_ARG_DST, *dst_dy_memory}};
+
+      if (BINARY_OP == dnnl::algorithm::binary_div)
+        args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
+                     *post_op_memory});
+
+      binary_prim->execute(astream, args);
+      broadcast_src_memory = dst_dy_memory;
+      dst_memory = dst_dy_memory;
+    }
+    astream.wait();
+
+    if (dout.dims() != dy->dims()) {
+      funcs::BroadcastReduction<T>(dev_ctx.GetPlace(),
+                                   onednn_engine,
+                                   dy,
+                                   &dout,
+                                   broadcast_src_memory,
+                                   dst_memory,
+                                   scales,
+                                   BINARY_OP == dnnl::algorithm::binary_sub);
+    } else {
+      dy->set_mem_desc(dst_memory->get_desc());
+    }
+  }
+}
+
+#define DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(name, algorithm) \
+  template <typename T, typename Context>                      \
+  void name##GradKernel(const Context& dev_ctx,                \
+                        const DenseTensor& x,                  \
+                        const DenseTensor& y,                  \
+                        const DenseTensor& dout,               \
+                        int axis,                              \
+                        DenseTensor* dx,                       \
+                        DenseTensor* dy) {                     \
+    ElementwiseGradKernel<T, algorithm>(                       \
+        dev_ctx, x, y, nullptr, dout, axis, dx, dy);           \
+  }
+
+DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Add, dnnl::algorithm::binary_add)
+DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Subtract, dnnl::algorithm::binary_sub)
+DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Multiply, dnnl::algorithm::binary_mul)
+
+template <typename T, typename Context>
+void DivideGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& y,
+                      const DenseTensor& out,
+                      const DenseTensor& dout,
+                      int axis,
+                      DenseTensor* dx,
+                      DenseTensor* dy) {
+  ElementwiseGradKernel<T, dnnl::algorithm::binary_div>(
+      dev_ctx, x, y, &out, dout, axis, dx, dy);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    add_grad, OneDNN, ONEDNN, phi::AddGradKernel, float, phi::dtype::bfloat16) {
+}
+
+PD_REGISTER_KERNEL(subtract_grad,
+                   OneDNN,
+                   ONEDNN,
+                   phi::SubtractGradKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(multiply_grad,
+                   OneDNN,
+                   ONEDNN,
+                   phi::MultiplyGradKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
+
+PD_REGISTER_KERNEL(divide_grad,
+                   OneDNN,
+                   ONEDNN,
+                   phi::DivideGradKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
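A note on why `DivideGradKernel` is the only kernel that forwards `out` into `ElementwiseGradKernel` (a sketch of the math, not part of the diff): for `out = x / y`,

    dL/dx = dout / y
    dL/dy = -dout * x / y^2 = -(dout * out) / y

so the `dy` path above can reuse the forward result: it runs `binary_mul` on `dout` and `out` with a `-1.0f` scale, and divides by `y` through the appended `binary_div` post-op.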
diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51be7559772d7261821a0f2fc9b78c59e2288887
--- /dev/null
+++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/elementwise_add_kernel.h"
+#include "paddle/phi/kernels/elementwise_divide_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
+#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
+
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, dnnl::algorithm BINARY_OP>
+void ElementwiseKernel(const OneDNNContext& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out) {
+  const auto& onednn_engine = dev_ctx.GetEngine();
+
+  float scale_x = dev_ctx.HasDnnAttr("Scale_x")
+                      ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x"))
+                      : 1;
+  float scale_y = dev_ctx.HasDnnAttr("Scale_y")
+                      ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y"))
+                      : 1;
+  float scale_out =
+      dev_ctx.HasDnnAttr("Scale_out")
+          ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out"))
+          : 1;
+
+  dnnl::post_ops post_operations;
+  funcs::AppendActivation(dev_ctx, post_operations);
+
+  auto* non_const_x = &x;
+  auto* non_const_y = &y;
+
+  funcs::BinaryOneDNNHandler<T> handler(BINARY_OP,
+                                        axis,
+                                        onednn_engine,
+                                        dev_ctx.GetPlace(),
+                                        non_const_x,
+                                        non_const_y,
+                                        out,
+                                        scale_x,
+                                        scale_y,
+                                        scale_out,
+                                        true,
+                                        post_operations);
+
+  // oneDNN's binary primitive is optimized for broadcasting y into x, so in
+  // the other case we have to swap the tensors to achieve optimal performance.
+  if (x.numel() < y.numel()) {
+    std::swap(non_const_x, non_const_y);
+  }
+
+  const auto src_x_memory = handler.AcquireSrcMemory(non_const_x);
+  const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y);
+  // (jczaja) For in-place execution, src and dst should be the same memory
+  // object, so x should share its buffer with out. However, the inplace UT
+  // only tests in-place execution for this op; it does not check whether x
+  // can be broadcast to match the shape of y. This matters because when x is
+  // broadcast, out takes the shape of y, which is bigger than x. Hence, if x
+  // is smaller in shape than out and they share a buffer (of x's shape), that
+  // buffer is not big enough to hold the result of the elementwise operation.
+  const bool reuse_x_memory = non_const_x->numel() == out->numel() &&
+                              non_const_x->IsSharedBufferWith(*out);
+  std::shared_ptr<dnnl::memory> dst_memory;
+
+  if (reuse_x_memory) {
+    dst_memory = src_x_memory;
+    // NOTE(chenfeiyu): when the output reuses memory from another tensor
+    // rather than allocating its own, it still needs to take care of its
+    // data type. Unfortunately, paddle's operator only infers the output's
+    // shape, not its data type. Alloc normally handles both allocation and
+    // data type, but when the memory is already allocated and there is no
+    // need to re-allocate, it just sets the data type, so it is called here
+    // to get the right data type set.
+    dev_ctx.template Alloc<T>(out);
+  } else {
+    dst_memory = handler.AcquireDstMemory(out);
+  }
+
+  const auto binary_prim = handler.AcquireForwardPrimitive();
+
+  auto& astream = OneDNNContext::tls().get_stream();
+
+  const std::unordered_map<int, dnnl::memory> args = {
+      {DNNL_ARG_SRC_0, *src_x_memory},
+      {DNNL_ARG_SRC_1, *src_y_memory},
+      {DNNL_ARG_DST, *dst_memory}};
+
+  binary_prim->execute(astream, args);
+  astream.wait();
+
+  if (handler.use_broadcasting_hack == false) {
+    out->set_mem_desc(dst_memory->get_desc());
+  } else {
+    auto dims = dst_memory->get_desc().dims();
+    dims.insert(dims.begin(), non_const_x->dims()[0]);
+    dims[1] /= dims[0];
+    out->set_mem_desc(dst_memory->get_desc().reshape(dims));
+  }
+}
+
+#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm)      \
+  template <typename T, typename Context>                      \
+  void name##RawKernel(const Context& dev_ctx,                 \
+                       const DenseTensor& x,                   \
+                       const DenseTensor& y,                   \
+                       int axis,                               \
+                       DenseTensor* out) {                     \
+    ElementwiseKernel<T, algorithm>(dev_ctx, x, y, axis, out); \
+  }
+
+DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Divide, dnnl::algorithm::binary_div)
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(divide_raw,
+                   OneDNN,
+                   ONEDNN,
+                   phi::DivideRawKernel,
+                   float,
+                   phi::dtype::bfloat16) {}
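As a self-contained illustration of `funcs::CalculateBroadcastedDims` from the grad-kernel file above — the helper that picks the reduction target dims when `dout` must be collapsed back to a broadcast input's shape — here is a minimal sketch with plain `std::vector<int64_t>` standing in for `phi::vectorize(dims())`; the example shapes are assumptions, not part of the diff:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Same logic as funcs::CalculateBroadcastedDims: keep dims of dout (src_tz)
// that match the grad tensor (dst_tz) and collapse everything else to 1,
// producing the output dims for the reduction_sum primitive.
std::vector<int64_t> CalculateBroadcastedDims(
    const std::vector<int64_t>& src_tz,    // dout dims
    const std::vector<int64_t>& dst_tz) {  // grad-tensor dims
  std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
  if (src_tz.size() == dst_tz.size()) {
    for (size_t i = 0; i < src_tz.size(); i++) {
      dst_tz_ex[i] = (src_tz[i] == dst_tz[i]) ? dst_tz[i] : 1;
    }
  } else {
    // Ranks differ: greedily match dst dims left-to-right against src dims.
    size_t j = 0;
    for (size_t i = 0; i < src_tz.size(); i++) {
      dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
      if (j == dst_tz.size()) break;
    }
  }
  return dst_tz_ex;
}

int main() {
  // dout has shape [2, 16, 5]; y with shape [16, 1] was broadcast into it.
  // The reduction target keeps the matching dim and collapses the rest,
  // after which the kernel reshapes the result back to y's dims.
  for (int64_t d : CalculateBroadcastedDims({2, 16, 5}, {16, 1})) {
    std::cout << d << ' ';  // prints: 1 16 1
  }
  std::cout << '\n';
}
```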