From 12d8a567b5bfecd284ff856f7471699ed3da0af7 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Mon, 30 May 2022 19:25:19 +0200 Subject: [PATCH] OneDNN md-in-tensor refactoring part 5: Memory descriptor enabled for elementwises, reductions and expand_v2 ops (#43036) * enabled md in elementwises, reductions and expand_v2 * CI fix for invalid numpy copy * fixed formatting * CI rerun * changes after review --- .../mkldnn/elementwise_mkldnn_op.h | 19 +++--- .../operators/mkldnn/expand_v2_mkldnn_op.cc | 21 +++--- .../reduce_ops/mkldnn/reduce_mkldnn_op.h | 65 ++++++++---------- paddle/fluid/platform/mkldnn_reuse.h | 68 +++++++------------ .../unittests/mkldnn/test_reduce_mkldnn_op.py | 13 +++- 5 files changed, 82 insertions(+), 104 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index d1a1aa3008..070bf9511a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -145,8 +145,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - z->set_layout(DataLayout::kMKLDNN); - z->set_format(platform::GetMKLDNNFormat(*dst_memory)); + z->set_mem_desc(dst_memory->get_desc()); } }; @@ -179,7 +178,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); + dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -189,7 +188,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // elementwise_add & elementwise_sub if (BINARY_OP == dnnl::algorithm::binary_add || BINARY_OP == dnnl::algorithm::binary_sub) { - dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + dst_memory = 
reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); @@ -218,8 +217,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dx->set_mem_desc(dst_memory->get_desc()); } if (dy) { @@ -232,7 +230,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { if (dout->dims() == dy->dims()) { auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dy, dout->format(), ctx.GetPlace()); + dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; std::vector scales(1); @@ -301,7 +299,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dst_memory = dst_dy_memory; } astream.wait(); - dy->set_layout(DataLayout::kMKLDNN); if (dout->dims() != dy->dims()) { // Broadcasting @@ -324,10 +321,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { {DNNL_ARG_DST, *dst_memory}, }); astream.wait(); - dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( - phi::vectorize(dy->dims())))); + dy->set_mem_desc(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims()))); } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dy->set_mem_desc(dst_memory->get_desc()); } } } diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 05d6bae5f7..91dccbee0a 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -45,19 +45,17 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { out_new_dims[i] = out_new_dims[i] > 0 ? 
out_new_dims[i] : x_vec_dims[i]; } - dnnl::memory::desc x_mem_desc = x->mem_desc(); if (x_vec_dims.size() != out_new_dims.size()) { - x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims, - out_new_dims.size()); + x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); } out->Resize(phi::make_ddim(out_new_dims)); paddle::platform::BroadcastDataMKLDNNHandler handler( - dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x, - 0.0f, 1.0f, x_mem_desc); + dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), x, out, + 0.0f, 1.0f, x_vec_dims); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem + auto dst_memory_p = handler.AcquireZeroedDstMemory(out); auto binary_p = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -73,14 +71,13 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { } private: - dnnl::memory::desc GetExtendedMemoryDescriptor( - const dnnl::memory::desc& x_mem_desc, - const std::vector& x_vec_dims, int new_size) const { - std::vector new_dims(new_size, 1); + std::vector GetExtendedXDims(const std::vector& x_vec_dims, + int new_size) const { + std::vector extended_x_dims(new_size, 1); std::copy(x_vec_dims.begin(), x_vec_dims.end(), - new_dims.begin() + new_size - x_vec_dims.size()); + extended_x_dims.begin() + new_size - x_vec_dims.size()); - return x_mem_desc.reshape(new_dims); + return extended_x_dims; } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 0c174b0825..94d8cc41d3 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -29,11 +29,11 @@ inline std::vector CalculateReducedDims( bool reduce_all, bool keep_dim) { if (keep_dim) return phi::vectorize(output->dims()); - if (reduce_all) - return 
std::vector(phi::vectorize(input->dims()).size(), 1); + if (reduce_all) return std::vector(input->dims().size(), 1); std::vector output_dims(phi::vectorize(input->dims())); for (size_t i = 0; i < reduce_dims.size(); ++i) { + // handle negative dims, e.g. "-1" means rightmost dimension reduce_dims[i] = (reduce_dims[i] >= 0) ? reduce_dims[i] : input->dims().size() + reduce_dims[i]; @@ -52,16 +52,16 @@ class ReduceMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto reduce_dims = ctx.Attr>("dim"); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - auto output_dims = - CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); - auto input_dims = phi::vectorize(input->dims()); + auto x_tz = phi::vectorize(x->dims()); + auto out_tz = + CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -69,18 +69,19 @@ class ReduceMKLDNNKernel : public framework::OpKernel { // copied without actual reduction.
// In that case reorder must be executed to maintain compatibility with // PaddlePaddle reduce op - if (input_dims == output_dims) { - dnnl::memory::data_type input_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); + if (x_tz == out_tz) { + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType( + framework::TransToProtoVarType(x->dtype())); platform::ReorderMKLDNNHandler reorder_handler( - input_dims, framework::TransToProtoVarType(input->dtype()), - input_type, onednn_engine); + x_tz, framework::TransToProtoVarType(x->dtype()), x_type, + onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - input->mem_desc(), platform::to_void_cast(input->data())); + x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - output, input->mem_desc(), ctx.GetPlace()); + // reuse mem desc since it is a simple copy + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); @@ -88,15 +89,15 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } else { platform::ReductionMKLDNNHandler handler(reduction_type, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), - input, output, output_dims); + x, out, out_tz); - auto src_memory_p = handler.AcquireSrcMemory(input); - auto dst_memory_p = handler.AcquireDstMemory(output); + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); std::unordered_map reduction_args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; @@ -105,8 +106,9 @@ class 
ReduceMKLDNNKernel : public framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + + out->set_mem_desc(dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } } }; @@ -127,22 +129,15 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { const auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - const auto input_dims = - CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); - const auto output_dims = phi::vectorize(dx->dims()); - - auto dout_mem_desc = dout->mem_desc(); - - if (input_dims != output_dims) { - dout_mem_desc = dout_mem_desc.reshape(input_dims); - } + auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); + auto dx_tz = phi::vectorize(dx->dims()); - platform::BroadcastDataMKLDNNHandler handler( - binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, - dout_mem_desc); + platform::BroadcastDataMKLDNNHandler handler(binary_type, onednn_engine, + ctx.GetPlace(), dout, dx, + scale_x, scale_y, dout_tz); const auto src_memory_p = handler.AcquireSrcMemory(dout); - const auto dst_memory_p = handler.AcquireDstMemory(dx); + const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx); const auto binary_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 13b5005a30..5476d244f6 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -616,29 +616,17 @@ class BinaryMKLDNNHandler public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, + const Tensor* x, const Tensor* y, Tensor* out, + float scale_x, float 
scale_y, float scale_out, const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - const auto src_x_tz = phi::vectorize(x->dims()); const auto src_y_tz = phi::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : phi::vectorize(z->dims()); + const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : phi::vectorize(out->dims()); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -667,7 +655,7 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = - CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); + CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -681,7 +669,7 @@ class BinaryMKLDNNHandler private: static inline dnnl::primitive_attr CreateAttributes( - dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::algorithm op, float scale_x, float scale_y, float scale_out, dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): @@ -699,9 +687,9 @@ class BinaryMKLDNNHandler // For mul operation on the other hand // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y // - float scale_0 = scale_z / scale_x; + float scale_0 = 
scale_out / scale_x; float scale_1 = - op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y; + op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y; dnnl::primitive_attr attributes; attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); @@ -718,21 +706,15 @@ class BroadcastDataMKLDNNHandler public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, - platform::Place cpu_place, const Tensor* out, - const Tensor* x, float scale_x, float scale_y, - const dnnl::memory::desc& x_mem_desc) + platform::Place cpu_place, const Tensor* x, + Tensor* out, float scale_x, float scale_y, + const std::vector& extended_x_dims) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType(), platform::GetPlainMKLDNNFormat(src0_tz.size())); - - const auto src1_md = x_mem_desc; + const auto src1_md = x->mem_desc().reshape(extended_x_dims); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -743,9 +725,9 @@ class BroadcastDataMKLDNNHandler } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = output->mutable_data( - this->place_, this->fwd_pd_->dst_desc().get_size()); + std::shared_ptr AcquireZeroedDstMemory(framework::Tensor* out) { + T_out* ptr = out->mutable_data(this->place_, + this->fwd_pd_->dst_desc().get_size()); memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -758,22 +740,18 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, 
std::vector y_tz, - const dnnl::primitive_attr& attr = NULL) + const Tensor* out, std::vector out_tz, + const dnnl::primitive_attr& attrs = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - - const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::any); + const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::any); - if (attr) - this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, - p, eps); + if (attrs) + this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(), + out_md, p, eps); else - this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, + this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p, eps); } }; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py index 46ee2a14a2..7b0bb706ae 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci import paddle.fluid as fluid import paddle @@ -92,6 +92,17 @@ class TestReduceSum4DReduceAllOneDNNOp(TestReduceDefaultWithGradOneDNNOp): self.outputs = {'Out': self.inputs['X'].sum()} +@OpTestTool.skip_if_not_cpu() +class TestReduceSum4DNoReduceSimpleCopyOneDNNOp( + TestReduceDefaultWithGradOneDNNOp): + def setUp(self): + self.op_type = "reduce_sum" + self.use_mkldnn = True + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'dim': tuple(), 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': 
np.copy(self.inputs['X'])} + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") -- GitLab