From 338cbeaacfb7bb01a6fbffe9c9105d64454fbcd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Wed, 4 Jan 2023 07:35:38 +0100 Subject: [PATCH] Revert "Replace matmul with matmul_v2 during oneDNN fuse passes (#49108)" (#49524) This reverts commit 2c444dfa3dc54a4cb8c53357df81dbab58f71503. --- .../matmul_activation_mkldnn_fuse_pass.cc | 10 - ...matmul_elementwise_add_mkldnn_fuse_pass.cc | 10 - ...tmul_transpose_reshape_mkldnn_fuse_pass.cc | 9 - .../mkldnn/operator_scale_onednn_fuse_pass.cc | 11 - ...shape_transpose_matmul_mkldnn_fuse_pass.cc | 9 - ...ranspose_matmul_mkldnn_fuse_pass_tester.cc | 2 +- paddle/fluid/operators/matmul_op.cc | 32 +- .../operators/mkldnn/matmul_mkldnn_op.cc | 630 ------------ .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 941 ++++++++++++++++++ paddle/fluid/operators/ops_extra_info.h | 2 +- ...test_mkldnn_matmul_activation_fuse_pass.py | 2 +- ...ul_elementwise_add_activation_fuse_pass.py | 2 +- ...mkldnn_matmul_elementwise_add_fuse_pass.py | 2 +- ...ldnn_matmul_transpose_reshape_fuse_pass.py | 2 +- ...n_matmul_v2_transpose_reshape_fuse_pass.py | 11 +- ...ldnn_reshape_transpose_matmul_fuse_pass.py | 2 +- .../unittests/mkldnn/test_matmul_mkldnn_op.py | 317 +++++- 17 files changed, 1304 insertions(+), 690 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc index 16bdfe6b05..07a608c5a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -77,16 +77,6 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( ? "gelu_tanh" : "gelu_erf"; } - - if (matmul_type == "matmul") { - matmul_op->SetType("matmul_v2"); - matmul_op->SetAttr("trans_x", matmul_op->GetAttr("transpose_X")); - matmul_op->SetAttr("trans_y", matmul_op->GetAttr("transpose_Y")); - auto matmul_alpha = matmul_op->GetAttrIfExists("alpha"); - if (matmul_alpha != 1.0f) { - matmul_op->SetAttr("alpha", matmul_alpha); - } - } matmul_op->SetAttr("fuse_activation", act_type); matmul_op->SetOutput("Out", {activation_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc index 5590e5113e..f045377465 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc @@ -65,16 +65,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( return; } - if (matmul_type == "matmul") { - matmul->Op()->SetType("matmul_v2"); - matmul->Op()->SetAttr("trans_x", matmul->Op()->GetAttr("transpose_X")); - matmul->Op()->SetAttr("trans_y", matmul->Op()->GetAttr("transpose_Y")); - auto matmul_alpha = matmul->Op()->GetAttrIfExists("alpha"); - if (matmul_alpha != 1.0f) { - matmul->Op()->SetAttr("alpha", matmul_alpha); - } - } - matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()}); matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc index 1d70722f73..40dbaa03a0 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc @@ -84,15 +84,6 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse( } OpDesc *matmul_desc = matmul_op->Op(); - if (matmul_type == "matmul") { - matmul_desc->SetType("matmul_v2"); - matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); - matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); - auto matmul_alpha = matmul_desc->GetAttrIfExists("alpha"); - if (matmul_alpha != 1.0f) { - matmul_desc->SetAttr("alpha", matmul_alpha); - } - } matmul_desc->SetOutput("Out", {reshape_out->Name()}); matmul_desc->SetAttr("fused_reshape_Out", reshape_shape); matmul_desc->SetAttr("fused_transpose_Out", transpose_axis); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc index f28159a76a..cb06f6eb12 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc @@ -85,17 +85,6 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph, scale = *(scale_tensor->data()); } - if (op_type == "matmul") { - operator_op->Op()->SetType("matmul_v2"); - operator_op->Op()->SetAttr("trans_x", - operator_op->Op()->GetAttr("transpose_X")); - operator_op->Op()->SetAttr("trans_y", - operator_op->Op()->GetAttr("transpose_Y")); - auto matmul_alpha = operator_op->Op()->GetAttrIfExists("alpha"); - if (matmul_alpha != 1.0f) { - operator_op->Op()->SetAttr("alpha", matmul_alpha); - } - } operator_op->Op()->SetAttr("fused_output_scale", scale); operator_op->Op()->SetOutput("Out", {scale_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 4d26190a50..25a79509b5 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -123,15 +123,6 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( return; } - if (matmul_type == "matmul") { - matmul_desc->SetType("matmul_v2"); - matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); - matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); - auto matmul_alpha = matmul_desc->GetAttrIfExists("alpha"); - if (matmul_alpha != 1.0f) { - matmul_desc->SetAttr("alpha", matmul_alpha); - } - } matmul_desc->SetInput(matmul_input_name, {(reshape_in)->Name()}); matmul_desc->SetAttr("fused_reshape_" + matmul_input_name, reshape_shape); matmul_desc->SetAttr("fused_transpose_" + matmul_input_name, diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc index e5bba1a38f..2dd13573d9 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc @@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) { int removed = 8; // 2* reshape, reshape_out, transpose, transpose_out if (with_xshapes) removed += 2; // transpose_xshape, reshape_xshape EXPECT_EQ(total_nodes_before - removed, total_nodes_after); - auto* matmul_op_desc = GetOpNodes(graph, "matmul_v2").at(0)->Op(); + auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op(); auto check = [&matmul_op_desc](std::string a) { std::string shape_str = "fused_reshape_" + a; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index b1c623b002..5c7474f65e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -345,6 +345,26 @@ class MatMulGradKernel : public framework::OpKernel { } }; +framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, + std::string input_name) { + auto shape = ctx.Attrs().Get>("fused_reshape_" + input_name); + auto axis = + ctx.Attrs().Get>("fused_transpose_" + input_name); + auto dim = ctx.GetInputDim(input_name); + + PADDLE_ENFORCE_GT(dim.size(), + 0, + platform::errors::InvalidArgument( + "The Input(%s) has not been initialized properly. The " + "shape of Input(%s) = [%s].", + dim)); + + if (!shape.empty() && !axis.empty()) { + dim = dim.reshape(shape).transpose(axis); + } + return dim; +} + template class MatMulDoubleGradKernel : public framework::OpKernel { public: @@ -559,8 +579,8 @@ class MatMulOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul"); OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "matmul"); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); + auto dim_x = GetDimForInput(*context, "X"); + auto dim_y = GetDimForInput(*context, "Y"); #ifdef PADDLE_WITH_MKLDNN // (jczaja): For NHWC execution output shape needs @@ -661,6 +681,14 @@ class MatMulOp : public framework::OperatorWithKernel { framework::DDim ddim_out = phi::make_ddim(dim_out); +#ifdef PADDLE_WITH_MKLDNN + auto shape = context->Attrs().Get>("fused_reshape_Out"); + auto axis = context->Attrs().Get>("fused_transpose_Out"); + + if (!shape.empty() && !axis.empty()) { + ddim_out = ddim_out.transpose(axis).reshape(shape); + } +#endif context->SetOutputDim("Out", ddim_out); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc deleted file mode 100644 index fd204e8cac..0000000000 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ /dev/null @@ -1,630 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace { -using dnnl::memory; -using paddle::framework::ExecutionContext; -using paddle::framework::GradVarName; -using phi::DenseTensor; -using phi::OneDNNContext; -using phi::vectorize; -using phi::funcs::OneDNNGetDataType; - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static DenseTensor FoldOuterDims(const DenseTensor &input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, - const DenseTensor *input) { - auto input_dims = vectorize(input->dims()); - if (input_dims.size() != 3) { - return *input; - } - - DenseTensor output; - output.Resize({input_dims[1], input_dims[0], input_dims[2]}); - - auto output_dims = vectorize(output.dims()); - - memory::data_type input_type = phi::funcs::ToOneDNNDataType(input->dtype()); - phi::funcs::ReorderOneDNNHandler reorder_handler( - output_dims, input->dtype(), input_type, dev_ctx.GetEngine()); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - memory::format_tag::abc, phi::funcs::to_void_cast(input->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - &output, memory::format_tag::bac, dev_ctx.GetPlace()); - auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, - reorder_dst_memory_p); - - auto &astream = OneDNNContext::tls().get_stream(); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); - return output; -} - -template -class MatMulV1OneDNNHandler - : public phi::funcs::OneDNNHandlerNoCachingT { - public: - MatMulV1OneDNNHandler(const ExecutionContext &ctx, - const dnnl::engine engine, - phi::Place cpu_place, - const std::vector &x_org_dims, - const std::vector &y_org_dims) - : phi::funcs::OneDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - auto trans_x = ctx.Attr("transpose_X"); - auto trans_y = ctx.Attr("transpose_Y"); - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (trans_x) { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } else { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } - - if (trans_y) { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } else { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - auto x_md = - memory::desc(x_dims, phi::funcs::OneDNNGetDataType(), x_strides); - auto y_md = - memory::desc(y_dims, phi::funcs::OneDNNGetDataType(), y_strides); - auto out_md = memory::desc( - out_ddims, phi::funcs::OneDNNGetDataType(), out_strides); - - dnnl::primitive_attr matmul_attrs; - dnnl::post_ops post_operations; - float scale_out = ComputeOutputScale(ctx); - if (scale_out != 1.0f) { - matmul_attrs.set_output_scales(0, {scale_out}); - } - matmul_attrs.set_post_ops(post_operations); - - this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); - } - - MatMulV1OneDNNHandler(const dnnl::engine engine, - phi::Place cpu_place, - DenseTensor *x, - bool trans_x, - DenseTensor *y, - bool trans_y, - DenseTensor *out, - float scale) - : phi::funcs::OneDNNHandlerNoCachingT(engine, - cpu_place) { - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y); - - memory::dim x_bs = mat_dim_x.batch_size_; - memory::dim y_bs = mat_dim_y.batch_size_; - - memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; - const memory::dim M = mat_dim_x.height_; - const memory::dim N = mat_dim_y.width_; - const memory::dim K = mat_dim_x.width_; - - memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; - memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; - memory::dims out_dims = {out_bs, M, N}; - - memory::dims x_strides = - trans_x ? memory::dims{M * K, 1, M} : memory::dims{M * K, K, 1}; - - memory::dims y_strides = - trans_y ? memory::dims{N * K, 1, K} : memory::dims{N * K, N, 1}; - memory::dims out_strides = memory::dims{M * N, N, 1}; - - auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_dims, OneDNNGetDataType(), out_strides); - - dnnl::primitive_attr attrs; - if (scale != 1.0f) attrs.set_output_scales(0, {scale}); - - this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); - } - - float ComputeOutputScale(const ExecutionContext &ctx) { - float alpha = ctx.Attr("alpha"); - if (ctx.HasAttr("Scale_x") && ctx.HasAttr("Scale_y") && - ctx.HasAttr("Scale_out")) { - float scale_x = ctx.Attr("Scale_x"); - float scale_y = ctx.Attr("Scale_y"); - bool force_fp32_out = ctx.HasAttr("force_fp32_output") - ? ctx.Attr("force_fp32_output") - : false; - float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); - alpha *= scale_out / (scale_x * scale_y); - } - return alpha; - } - - std::shared_ptr AcquireWeightsMemory(const DenseTensor *input) { - const YT *input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(input_data)); - } - - std::shared_ptr AcquireDstMemory(DenseTensor *output) { - // We cannot use base AcquireDstMemory as it makes an allocation request - // base on DST memory primitive size. This is fine in general, but in MatMul - // we have primitive that covers only one batch of Data and then shift - // pointer for every new batch. Hence DenseTensor size is bigger that - // dst memory primitive size. So would we request less memory that is there - // and it triggers an assertion. So as there is no 'any' format here we can - // leave default size of DenseTensor as computed in ComputeInferShape - OT *ptr = output->mutable_data(this->place_); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); - } - - private: - uint16_t batch_size_; -}; - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorToMatrixSequence( - DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. - */ -static void ReshapeXYOutToMatrixSequence(DenseTensor *x, - DenseTensor *y, - DenseTensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); - auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorToMatrixSequence(x, mat_dim_x); - ReshapeTensorToMatrixSequence(y, mat_dim_y); -} - -std::vector Transpose(const std::vector &x, - const std::vector &axis) { - size_t in_rank = x.size(); - size_t axis_size = axis.size(); - - auto axis_set = std::set(axis.begin(), axis.end()); - PADDLE_ENFORCE_EQ(axis_set.size(), - axis_size, - phi::errors::InvalidArgument( - "In an axis array, elements must be unique.")); - - PADDLE_ENFORCE_EQ( - in_rank, - axis_size, - phi::errors::InvalidArgument("The input dimension's size " - "should be equal to the axis's size. " - "But received dimension is %d, " - "axis's size is %d", - in_rank, - axis_size)); - - PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), - axis_size, - phi::errors::InvalidArgument( - "Axis values must be ranging from 0 to (dims - 1).")); - - std::vector new_x(x.size()); - for (size_t i = 0; i < x.size(); i++) { - new_x[i] = x[axis[i]]; - } - return new_x; -} - -template -void ExecuteMatMul(const ExecutionContext &ctx, - const DenseTensor *x, - const std::vector &x_dims, - const DenseTensor *y, - const std::vector &y_dims, - DenseTensor *out) { - const auto &dev_ctx = ctx.template device_context(); - MatMulV1OneDNNHandler handler( - ctx, dev_ctx.GetEngine(), ctx.GetPlace(), x_dims, y_dims); - - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto &astream = OneDNNContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); -} - -template -class MatMulV1OneDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), - 1, - phi::errors::Unimplemented( - "oneDNN matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - constexpr bool is_int8 = phi::funcs::is_int8(); - constexpr bool is_bfloat16 = phi::funcs::is_bfloat16(); - const bool force_fp32_output = ctx.HasAttr("force_fp32_output") - ? ctx.Attr("force_fp32_output") - : false; - constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); - - auto x_dims = vectorize(x->dims()); - auto y_dims = vectorize(y->dims()); - - int ndims = std::max(x_dims.size(), y_dims.size()); - ndims = std::max(ndims, 3); - - std::vector x_bd_dims(ndims, 1); - std::vector y_bd_dims(ndims, 1); - - CalculateMatrixDims(x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); - - if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); - } else if (is_bfloat16) { - ExecuteMatMul( - ctx, x, x_bd_dims, y, y_bd_dims, out); - } else if (fuse_relu) { - ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); - } else { - ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); - } - } - - private: - void CalculateMatrixDims(const std::vector &x_dims, - const std::vector &y_dims, - std::vector *x_bd_dims, - std::vector *y_bd_dims, - DenseTensor *out) const { - if (x_dims.size() == 1) { - (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; - } else if (x_dims.size() == 2) { - (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1]; - (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0]; - } else { - for (size_t i = 0; i < x_dims.size(); ++i) { - (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i]; - } - } - if (y_dims.size() == 1) { - (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0]; - } else if (y_dims.size() == 2) { - (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1]; - (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0]; - } else { - for (size_t i = 0; i < y_dims.size(); ++i) { - (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i]; - } - } - - if (x_dims.size() > 2 && y_dims.size() > 2) { - auto out_dims = vectorize(out->dims()); - for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { - PADDLE_ENFORCE_EQ( - (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || - (*y_bd_dims)[i] == 1, - true, - phi::errors::InvalidArgument( - "DenseTensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, - (*x_bd_dims)[i], - i, - (*y_bd_dims)[i])); - (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); - } - out->Resize(phi::make_ddim((out_dims))); - } - } -}; - -template -class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), - 1, - phi::errors::Unimplemented( - "oneDNN matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(paddle::framework::GradVarName("Out")); - auto *dx = ctx.Output(paddle::framework::GradVarName("X")); - auto *dy = ctx.Output(paddle::framework::GradVarName("Y")); - - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - - ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - phi::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - phi::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); - } else if (transpose_x) { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &y, - false, - false, - &dout, - true, - false, - dx); - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &x, - false, - false, - &dout, - false, - true, - dy); - } else if (transpose_y) { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &dout, - false, - false, - &y, - false, - true, - dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); - } else { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &dout, - false, - false, - &y, - true, - false, - dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - dx->set_mem_desc(x.mem_desc()); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - dy->set_mem_desc(y.mem_desc()); - } - } - } - - private: - void ExecuteMatMulGrad(const ExecutionContext &ctx, - const OneDNNContext &dev_ctx, - const dnnl::engine &engine, - DenseTensor *x, - bool trans_x, - bool is_fold_init_dims_x, - DenseTensor *y, - bool trans_y, - bool is_fold_init_dims_y, - DenseTensor *out) const { - // gradient is calculated in a different way when broadcasting is used - bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && - out->dims().size() == 2; - - DenseTensor x_combined, y_combined; - if (need_combine) { - x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) - : FoldFirstAndLastDims(dev_ctx, x); - y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) - : FoldFirstAndLastDims(dev_ctx, y); - } else { - x_combined = *x; - y_combined = *y; - } - - float alpha = ctx.Attr("alpha"); - - MatMulV1OneDNNHandler handler(engine, - ctx.GetPlace(), - &x_combined, - trans_x, - &y_combined, - trans_y, - out, - alpha); - - const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); - const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto &astream = OneDNNContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); - } -}; - -} // anonymous namespace - -REGISTER_OP_KERNEL(matmul, - MKLDNN, - ::phi::CPUPlace, - MatMulV1OneDNNKernel, - MatMulV1OneDNNKernel, - MatMulV1OneDNNKernel, - MatMulV1OneDNNKernel); - -REGISTER_OP_KERNEL(matmul_grad, - MKLDNN, - ::phi::CPUPlace, - MatMulV1GradOneDNNKernel, - MatMulV1GradOneDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc new file mode 100644 index 0000000000..7bf66bae93 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -0,0 +1,941 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace { +using dnnl::memory; +using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; +using phi::OneDNNContext; +using phi::vectorize; +using phi::funcs::OneDNNGetDataType; + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +static phi::DenseTensor FoldOuterDims(const phi::DenseTensor &input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static phi::DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, + const phi::DenseTensor *input) { + auto input_dims = vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + phi::DenseTensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = vectorize(output.dims()); + + memory::data_type input_type = phi::funcs::ToOneDNNDataType(input->dtype()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + output_dims, input->dtype(), input_type, dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, phi::funcs::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + auto &astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) { + auto shape = ctx.Attr>("fused_reshape_" + input_name); + auto axis = ctx.Attr>("fused_transpose_" + input_name); + auto input_dims = ctx.Input(input_name)->dims(); + if (!shape.empty() && !axis.empty()) { + return input_dims.reshape(shape).transpose(axis); + } + return input_dims; +} + +template +class MatMulV2MKLDNNHandler + : public phi::funcs::OneDNNHandlerNoCachingT { + public: + MatMulV2MKLDNNHandler(const ExecutionContext &ctx, + const dnnl::engine engine, + paddle::platform::Place cpu_place, + const std::vector &x_org_dims, + bool trans_x, + const std::vector &y_org_dims, + bool trans_y, + bool is_output_fused, + const std::vector &x_strides_override, + const std::vector &y_strides_override) + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + // TODO(jczaja): Why not for int8?? + if (!phi::funcs::is_int8() && is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = + memory::desc(x_dims, phi::funcs::OneDNNGetDataType(), x_strides); + auto y_md = + memory::desc(y_dims, phi::funcs::OneDNNGetDataType(), y_strides); + auto out_md = memory::desc( + out_ddims, phi::funcs::OneDNNGetDataType(), out_strides); + + const dnnl::primitive_attr matmul_attrs = CreateMatmulAttrs(ctx); + + this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); + } + + void AppendActivation(const ExecutionContext &ctx, + dnnl::post_ops &post_ops, // NOLINT + float activation_scale = 1.0f) { + const auto invalid_attribute = + ctx.HasAttr("fuse_activation") + ? ctx.Attr("fuse_activation").empty() + : true; + if (invalid_attribute) return; + + const auto fuse_activation = ctx.Attr("fuse_activation"); + const auto fuse_alpha = + ctx.HasAttr("fuse_alpha") ? ctx.Attr("fuse_alpha") : 0.0f; + const auto fuse_beta = + ctx.HasAttr("fuse_beta") ? ctx.Attr("fuse_beta") : 0.0f; + + if (fuse_activation == "hard_sigmoid") { + post_ops.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, + fuse_alpha, + fuse_beta); + post_ops.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + } else { + const std::unordered_map activation_map = { + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"hard_swish", dnnl::algorithm::eltwise_hardswish}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"mish", dnnl::algorithm::eltwise_mish}, + {"relu", dnnl::algorithm::eltwise_relu}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"tanh", dnnl::algorithm::eltwise_tanh}}; + + const auto &activation_type = activation_map.find(fuse_activation); + + PADDLE_ENFORCE_NE( + activation_type, + activation_map.end(), + phi::errors::InvalidArgument( + "Activation '%s' not found in oneDNN algorithms mapper", + fuse_activation)); + + post_ops.append_eltwise( + activation_scale, activation_type->second, fuse_alpha, fuse_beta); + } + } + + float ComputeOutputScale(const ExecutionContext &ctx) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + if (ctx.HasAttr("Scale_x") && ctx.HasAttr("Scale_y") && + ctx.HasAttr("Scale_out")) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.HasAttr("force_fp32_output") + ? ctx.Attr("force_fp32_output") + : false; + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + alpha *= scale_out / (scale_x * scale_y); + } + return alpha; + } + + dnnl::primitive_attr CreateMatmulAttrs(const ExecutionContext &ctx) { + dnnl::primitive_attr matmul_attrs; + dnnl::post_ops post_operations; + + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {scale_out}); + } + + if (ctx.HasInput("ResidualData")) { + auto *residual_data = ctx.Input("ResidualData"); + auto residual_data_tz = phi::vectorize(residual_data->dims()); + auto residual_data_md = memory::desc(residual_data_tz, + phi::funcs::OneDNNGetDataType(), + dnnl::memory::format_tag::any); + post_operations.append_binary(dnnl::algorithm::binary_add, + residual_data_md); + if (ctx.HasAttr("Scale_in_eltwise")) { + float sum_scale = scale_out / ctx.Attr("Scale_in_eltwise"); + post_operations.append_sum(sum_scale); + } + } + + AppendActivation(ctx, post_operations); + + if (ctx.HasAttr("fused_output_scale")) { + float scale_alpha = ctx.Attr("fused_output_scale"); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } + + matmul_attrs.set_post_ops(post_operations); + return matmul_attrs; + } + + std::vector FakeTransposeStrides( + const std::vector &matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { + const YT *input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(input_data)); + } + + std::shared_ptr AcquireDstMemory(phi::DenseTensor *output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence phi::DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of phi::DenseTensor as computed in ComputeInferShape + OT *ptr = output->mutable_data(this->place_); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } +}; + +template +class MatMulMKLDNNHandler + : public phi::funcs::OneDNNHandlerNoCachingT { + public: + MatMulMKLDNNHandler(const dnnl::engine engine, + paddle::platform::Place cpu_place, + phi::DenseTensor *x, + bool trans_x, + phi::DenseTensor *y, + bool trans_y, + phi::DenseTensor *out, + float scale) + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; + + memory::dims y_strides = + !trans_y ? memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, OneDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + + std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { + const YT *input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(input_data)); + } + + public: + void Execute(const phi::DenseTensor *x, + const phi::DenseTensor *y, + phi::DenseTensor *out) { + const auto src_memory_p = this->AcquireSrcMemory(x); + const auto weights_memory_p = this->AcquireWeightsMemory(y); + const auto dst_memory_p = this->AcquireDstMemory(out); + + auto matmul_p = this->AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = OneDNNContext::tls().get_stream(); + + // Simulate batch matmul by processing in loop + void *x_ptr = src_memory_p->get_data_handle(); + void *y_ptr = weights_memory_p->get_data_handle(); + void *out_ptr = dst_memory_p->get_data_handle(); + auto offsets = std::make_tuple(x_offset_, y_offset_, out_offset_); + for (uint16_t i = 0; i < batch_size_; ++i) { + src_memory_p->set_data_handle(x_ptr); + weights_memory_p->set_data_handle(y_ptr); + dst_memory_p->set_data_handle(out_ptr); + matmul_p->execute(astream, matmul_args); + x_ptr = static_cast(x_ptr) + std::get<0>(offsets); + y_ptr = static_cast(y_ptr) + std::get<1>(offsets); + out_ptr = static_cast(out_ptr) + std::get<2>(offsets); + } + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims())); + } + + std::shared_ptr AcquireDstMemory(phi::DenseTensor *output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence phi::DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of phi::DenseTensor as computed in ComputeInferShape + OT *ptr = output->mutable_data(this->place_); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } + + private: + uint32_t x_offset_; + uint32_t y_offset_; + uint32_t out_offset_; + uint16_t batch_size_; +}; + +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutToMatrixSequence(phi::DenseTensor *x, + phi::DenseTensor *y, + phi::DenseTensor *out, + bool trans_x, + bool trans_y) { + auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); + auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, + mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + +std::vector Transpose(const std::vector &x, + const std::vector &axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); + + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), + axis_size, + paddle::platform::errors::InvalidArgument( + "In an axis array, elements must be unique.")); + + PADDLE_ENFORCE_EQ(in_rank, + axis_size, + paddle::platform::errors::InvalidArgument( + "The input dimension's size " + "should be equal to the axis's size. " + "But received dimension is %d, " + "axis's size is %d", + in_rank, + axis_size)); + + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), + axis_size, + paddle::platform::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); + + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; +} + +std::vector GetInputStrides(const ExecutionContext &ctx, + const std::string input_name) { + auto shape = ctx.Attr>("fused_reshape_" + input_name); + auto axis = ctx.Attr>("fused_transpose_" + input_name); + auto input_dims = ctx.Input(input_name)->dims(); + auto new_dims = input_dims; + if (!shape.empty() && !axis.empty()) { + new_dims = input_dims.reshape(shape).transpose(axis); + } + + auto &MatrixDimsFromVector = input_name == "X" + ? phi::funcs::RowMatrixDimsFromVector + : phi::funcs::ColumnMatrixDimsFromVector; + phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor( + MatrixDimsFromVector(new_dims), + 0, + ctx.HasAttr("trans_x") + ? ctx.Attr(std::string("trans_") + + static_cast(std::tolower(input_name[0]))) + : ctx.Attr(std::string("transpose_") + input_name[0])); + + std::vector strides; + if (!shape.empty()) { + auto shape2 = input_dims.reshape(shape); + strides.push_back(1); + for (auto i = shape2.size() - 1; i > 0; --i) { + strides.insert(strides.begin(), + strides.front() * static_cast(shape2[i])); + } + strides = Transpose(strides, axis); + if (shape.size() == 2) + strides.insert(strides.begin(), + static_cast(shape[0] * shape[1])); + mat_dim.stride_ = strides[0]; + if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); + } + return strides; +} + +bool IsOutputFused(const ExecutionContext &ctx) { + auto &fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto &fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); +} + +template +void ExecuteMatMulV2(const ExecutionContext &ctx, + const dnnl::engine onednn_engine, + const phi::DenseTensor *x, + const std::vector &x_dims, + bool trans_x, + const phi::DenseTensor *y, + const std::vector &y_dims, + bool trans_y, + phi::DenseTensor *out) { + std::vector x_strides_override = GetInputStrides(ctx, "X"); + std::vector y_strides_override = GetInputStrides(ctx, "Y"); + MatMulV2MKLDNNHandler handler(ctx, + onednn_engine, + ctx.GetPlace(), + x_dims, + trans_x, + y_dims, + trans_y, + IsOutputFused(ctx), + x_strides_override, + y_strides_override); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (ctx.HasInput("ResidualData")) { + auto *residual_data = ctx.Input("ResidualData"); + const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *residual_data_memory_p}); + } + + auto &astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need + // permute + if (IsOutputFused(ctx) && !phi::funcs::is_int8()) { + auto axis = ctx.Attr>("fused_transpose_Out"); + auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); + out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); + } else { + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + } +} + +template +class MatMulMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + paddle::platform::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + constexpr bool is_int8 = phi::funcs::is_int8(); + constexpr bool is_bfloat16 = phi::funcs::is_bfloat16(); + const bool force_fp32_output = ctx.HasAttr("force_fp32_output") + ? ctx.Attr("force_fp32_output") + : false; + constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") + : ctx.Attr("transpose_X"); + bool trans_y = ctx.HasAttr("trans_y") ? ctx.Attr("trans_y") + : ctx.Attr("transpose_Y"); + + auto x_dims = vectorize(GetDimForInput(ctx, "X")); + auto y_dims = vectorize(GetDimForInput(ctx, "Y")); + + int ndims = std::max(x_dims.size(), y_dims.size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + CalculateMatrixDims(ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); + + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); + } else if (is_bfloat16) { + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); + } else if (fuse_relu) { + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); + } else { + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); + } + } + + private: + void CalculateMatrixDims(const ExecutionContext &ctx, + const std::vector &x_dims, + const std::vector &y_dims, + std::vector *x_bd_dims, + std::vector *y_bd_dims, + phi::DenseTensor *out) const { + if (x_dims.size() == 1) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; + } else if (x_dims.size() == 2) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1]; + (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0]; + } else { + for (size_t i = 0; i < x_dims.size(); ++i) { + (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i]; + } + } + if (y_dims.size() == 1) { + (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0]; + } else if (y_dims.size() == 2) { + (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1]; + (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0]; + } else { + for (size_t i = 0; i < y_dims.size(); ++i) { + (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i]; + } + } + + if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) { + auto out_dims = vectorize(out->dims()); + for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { + PADDLE_ENFORCE_EQ( + (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || + (*y_bd_dims)[i] == 1, + true, + paddle::platform::errors::InvalidArgument( + "phi::DenseTensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, + (*x_bd_dims)[i], + i, + (*y_bd_dims)[i])); + (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); + } + out->Resize(phi::make_ddim((out_dims))); + } + } +}; + +template +class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + paddle::platform::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = + *ctx.Input(paddle::framework::GradVarName("Out")); + auto *dx = + ctx.Output(paddle::framework::GradVarName("X")); + auto *dy = + ctx.Output(paddle::framework::GradVarName("Y")); + + bool transpose_x = ctx.HasAttr("transpose_X") + ? ctx.Attr("transpose_X") + : ctx.Attr("trans_x"); + bool transpose_y = ctx.HasAttr("transpose_Y") + ? ctx.Attr("transpose_Y") + : ctx.Attr("trans_y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + paddle::framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + paddle::framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &y, + false, + false, + &dout, + true, + false, + dx); + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &x, + false, + false, + &dout, + false, + true, + dy); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + false, + true, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); + } else { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + true, + false, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + dx->set_mem_desc(x.mem_desc()); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + dy->set_mem_desc(y.mem_desc()); + } + } + } + + private: + void ExecuteMatMulGrad(const ExecutionContext &ctx, + const OneDNNContext &dev_ctx, + const dnnl::engine &engine, + phi::DenseTensor *x, + bool trans_x, + bool is_fold_init_dims_x, + phi::DenseTensor *y, + bool trans_y, + bool is_fold_init_dims_y, + phi::DenseTensor *out) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + phi::DenseTensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } + + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + + MatMulMKLDNNHandler handler(engine, + ctx.GetPlace(), + &x_combined, + trans_x, + &y_combined, + trans_y, + out, + alpha); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + } +}; + +} // anonymous namespace + +REGISTER_OP_KERNEL(matmul, + MKLDNN, + ::phi::CPUPlace, + MatMulMKLDNNKernel, + MatMulMKLDNNKernel, + MatMulMKLDNNKernel, + MatMulMKLDNNKernel); + +REGISTER_OP_KERNEL(matmul_grad, + MKLDNN, + ::phi::CPUPlace, + MatMulGradMKLDNNKernel, + MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 001705a3c0..94f0fa2a60 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -99,7 +99,7 @@ const std::unordered_map {"fuse_alpha", ExtraAttrProperty::ONEDNN}, {"fuse_beta", ExtraAttrProperty::ONEDNN}, {"fuse_relu", ExtraAttrProperty::ONEDNN}, - {"alpha", ExtraAttrProperty::ONEDNN}, + {"fused_output_scale", ExtraAttrProperty::ONEDNN}, {"fuse_residual_connection", ExtraAttrProperty::ONEDNN}, {"fuse_with_relu", ExtraAttrProperty::ONEDNN}, {"fused_reshape_Out", ExtraAttrProperty::ONEDNN}, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index 510fa4f79b..964aad16b9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): 'operator_scale_onednn_fuse_pass', ], ) - yield config, ['matmul_v2'], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py index cdfbed4c50..0e0c542be6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest): 'matmul_activation_mkldnn_fuse_pass', ], ) - yield config, ['matmul_v2'], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py index 42152e74da..b359d4a4c9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest): config = self.create_inference_config( use_mkldnn=True, passes=['matmul_elementwise_add_mkldnn_fuse_pass'] ) - yield config, ['matmul_v2'], (1e-5, 1e-5) + yield config, ['matmul'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py index 11e61fcf09..a2d2260683 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py @@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul_v2"], (1e-5, 1e-5) + yield config, ["matmul"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index 41bc180053..2d368433ed 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -135,8 +135,17 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest): return program_config def sample_predictor_configs(self, program_config): + # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op + fused_op = "matmul_v2" + input1_dim1 = program_config.inputs["input_data1"].shape[0] + input2_dim1 = program_config.inputs["input_data2"].shape[0] + input1_dim2 = program_config.inputs["input_data1"].shape[1] + input2_dim2 = program_config.inputs["input_data2"].shape[1] + if input1_dim1 == input2_dim1 and input1_dim2 == input2_dim2: + fused_op = "matmul" + config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul_v2"], (1e-5, 1e-5) + yield config, [fused_op], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py index 2ab7184237..fc4d800607 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py @@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul_v2"], (1e-5, 1e-5) + yield config, ["matmul"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py index 4304527a20..117af53670 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci class TestDnnlMatMulOp(OpTest): @@ -254,6 +254,321 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp): self.attrs = {'force_fp32_output': True} +@skip_check_grad_ci(reason="DNNL's MatMul doesn't implement grad kernel.") +class TestReshapeTransposeMatMulOp(OpTest): + def init_data_type(self): + self.data_type_ = 'float32' + + def generate_data(self): + self.x = ( + np.random.random([2, 128, 768]) + .astype("float32") + .reshape([2, 128, 12, 64]) + .transpose([0, 2, 1, 3]) + ) + self.y = ( + np.random.random([2, 128, 768]) + .astype("float32") + .reshape([2, 128, 12, 64]) + .transpose([0, 2, 1, 3]) + ) + self.out = np.matmul(self.x, self.y.transpose([0, 1, 3, 2])) + self.fused_reshape_X = [] + self.fused_transpose_X = [] + self.fused_reshape_Y = [] + self.fused_transpose_Y = [] + + def set_op_type_and_transpose_y_name(self): + self.op_type = "matmul" + self.transpose_y_name = "transpose_Y" + + def setUp(self): + self.set_op_type_and_transpose_y_name() + self._cpu_only = True + self.use_mkldnn = True + self.transpose_y = True + self.init_data_type() + self.generate_data() + + self.inputs = {'X': self.x, 'Y': self.y} + self.attrs = { + 'use_mkldnn': self.use_mkldnn, + self.transpose_y_name: self.transpose_y, + } + if len(self.fused_transpose_X) > 0: + self.attrs['fused_transpose_X'] = self.fused_transpose_X + if len(self.fused_transpose_Y) > 0: + self.attrs['fused_transpose_Y'] = self.fused_transpose_Y + if len(self.fused_reshape_X) > 0: + self.attrs['fused_reshape_X'] = self.fused_reshape_X + if len(self.fused_reshape_Y) > 0: + self.attrs['fused_reshape_Y'] = self.fused_reshape_Y + + self.outputs = {'Out': self.out} + + def test_check_output(self): + self.check_output() + + +class TestReshapeTransposeMatMulOp4DXFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = np.random.random([2, 128, 768]).astype("float32") + self.y = ( + np.random.random([2, 128, 768]) + .astype("float32") + .reshape([2, 128, 12, 64]) + .transpose([0, 2, 1, 3]) + ) + self.fused_transpose_X = [0, 2, 1, 3] + self.fused_reshape_X = [0, 0, 12, 64] + self.fused_transpose_Y = [] + self.fused_reshape_Y = [] + self.out = np.matmul( + self.x.reshape([2, 128, 12, 64]).transpose([0, 2, 1, 3]), + self.y.transpose([0, 1, 3, 2]), + ) + + +class TestReshapeTransposeMatMulOp4DXInt8(TestReshapeTransposeMatMulOp4DXFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp4DYFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = ( + np.random.random([2, 128, 768]) + .astype("float32") + .reshape([2, 128, 12, 64]) + .transpose([0, 2, 1, 3]) + ) + self.y = np.random.random([2, 128, 768]).astype("float32") + self.fused_transpose_X = [] + self.fused_reshape_X = [] + self.fused_transpose_Y = [0, 2, 1, 3] + self.fused_reshape_Y = [0, 0, 12, 64] + self.out = np.matmul( + self.x, self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]) + ) + + +class TestReshapeTransposeMatMulOp4DYInt8(TestReshapeTransposeMatMulOp4DYFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp4DXYFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = np.random.random([2, 128, 768]).astype("float32") + self.y = np.random.random([2, 128, 768]).astype("float32") + self.fused_transpose_X = [0, 2, 1, 3] + self.fused_reshape_X = [0, 0, 12, 64] + self.fused_transpose_Y = [0, 2, 1, 3] + self.fused_reshape_Y = [0, 0, 12, 64] + self.out = np.matmul( + self.x.reshape([2, 128, 12, 64]).transpose([0, 2, 1, 3]), + self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]), + ) + + +class TestReshapeTransposeMatMulOp4DXYInt8( + TestReshapeTransposeMatMulOp4DXYFloat +): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp2DXFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = np.random.random([2, 5, 10]).astype("float32") + self.y = ( + np.random.random([2, 5, 10]) + .astype("float32") + .reshape([10, 10]) + .transpose([1, 0]) + ) + self.fused_transpose_X = [1, 0] + self.fused_reshape_X = [10, 10] + self.fused_transpose_Y = [] + self.fused_reshape_Y = [] + self.out = np.matmul( + self.x.reshape([10, 10]).transpose([1, 0]), self.y.transpose([1, 0]) + ) + + +class TestReshapeTransposeMatMulOp2DXInt8(TestReshapeTransposeMatMulOp2DXFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp2DYFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = ( + np.random.random([2, 5, 10]) + .astype("float32") + .reshape([10, 10]) + .transpose([1, 0]) + ) + self.y = np.random.random([2, 5, 10]).astype("float32") + self.fused_transpose_X = [] + self.fused_reshape_X = [] + self.fused_transpose_Y = [1, 0] + self.fused_reshape_Y = [10, 10] + self.out = np.matmul(self.x, self.y.reshape([10, 10])) + + +class TestReshapeTransposeMatMulOp2DYInt8(TestReshapeTransposeMatMulOp2DYFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp3DXFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = np.random.random([2, 2, 5, 5]).astype("float32") + self.y = ( + np.random.random([2, 2, 5, 5]) + .astype("float32") + .reshape([2, 10, 5]) + .transpose([0, 2, 1]) + ) + self.fused_transpose_X = [0, 2, 1] + self.fused_reshape_X = [2, 10, 5] + self.fused_transpose_Y = [] + self.fused_reshape_Y = [] + self.out = np.matmul( + self.x.reshape([2, 10, 5]).transpose(0, 2, 1), + self.y.transpose(0, 2, 1), + ) + + +class TestReshapeTransposeMatMulOp3DXInt8(TestReshapeTransposeMatMulOp3DXFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +class TestReshapeTransposeMatMulOp3DYFloat(TestReshapeTransposeMatMulOp): + def generate_data(self): + self.x = ( + np.random.random([2, 2, 5, 5]) + .astype(self.data_type_) + .reshape([2, 10, 5]) + .transpose([0, 2, 1]) + ) + self.y = np.random.random([2, 2, 5, 5]).astype(self.data_type_) + self.fused_transpose_X = [] + self.fused_reshape_X = [] + self.fused_transpose_Y = [0, 2, 1] + self.fused_reshape_Y = [2, 10, 5] + self.out = np.matmul(self.x, self.y.reshape([2, 10, 5])) + + +class TestReshapeTransposeMatMulOp3DYInt8(TestReshapeTransposeMatMulOp3DYFloat): + def init_data_type(self): + self.data_type_ = 'int8' + + +@skip_check_grad_ci(reason="Tests inference only optimization.") +class TestMatMulOpTransposeReshapeEmptyFloat(OpTest): + def init_data_type(self): + self.data_type_ = np.float32 + + def generate_data(self): + self.bs = 1 + self.x = np.random.random([self.bs, 128, 128]).astype(self.data_type_) + self.y = np.random.random([self.bs, 128, 64]).astype(self.data_type_) + + def init_params_and_out(self): + self.transpose_out = [] + self.reshape_out = [] + self.out = np.matmul(self.x, self.y) + + def set_op_type(self): + self.op_type = "matmul" + + def setUp(self): + self.set_op_type() + self._cpu_only = True + self.use_mkldnn = True + self.init_data_type() + self.generate_data() + self.init_params_and_out() + + self.inputs = {'X': self.x, 'Y': self.y} + self.attrs = {'use_mkldnn': self.use_mkldnn} + + if len(self.reshape_out) > 0: + self.attrs['fused_reshape_Out'] = self.reshape_out + if len(self.transpose_out) > 0: + self.attrs['fused_transpose_Out'] = self.transpose_out + + self.inputs = {'X': self.x, 'Y': self.y} + self.outputs = {'Out': self.out} + + def test_check_output(self): + self.check_output() + + def check_raise_error(self, msg): + try: + self.check_output() + except Exception as e: + if msg in str(e): + raise AttributeError + else: + print(e) + + +class TestMatMulOpTransposeReshapeIntEmptyInt( + TestMatMulOpTransposeReshapeEmptyFloat +): + def init_data_type(self): + self.data_type_ = np.int8 + + +class TestMatMulOpTransposeReshapeBasicFloat( + TestMatMulOpTransposeReshapeEmptyFloat +): + def generate_data(self): + self.bs = 8 + self.x = np.random.random([self.bs, 12, 128, 128]).astype( + self.data_type_ + ) + self.y = np.random.random([self.bs, 12, 128, 64]).astype( + self.data_type_ + ) + + def init_params_and_out(self): + self.transpose_out = [0, 2, 1, 3] + self.reshape_out = [0, 0, self.x.shape[1] * self.y.shape[-1]] + self.out = ( + np.matmul(self.x, self.y) + .transpose([0, 2, 1, 3]) + .reshape([self.bs, -1, self.x.shape[1] * self.y.shape[-1]]) + ) + + +class TestMatMulOpTransposeReshapeBasicInt( + TestMatMulOpTransposeReshapeBasicFloat +): + def init_data_type(self): + self.data_type_ = np.int8 + + +class TestMatMulOpTransposeReshapeOtherDimFloat( + TestMatMulOpTransposeReshapeBasicFloat +): + def generate_data(self): + self.bs = 11 + self.x = np.random.random([self.bs, 12, 14, 18]).astype(self.data_type_) + self.y = np.random.random([self.bs, 12, 18, 13]).astype(self.data_type_) + + +class TestMatMulOpTransposeReshapeOtherDimInt( + TestMatMulOpTransposeReshapeOtherDimFloat +): + def init_data_type(self): + self.data_type_ = np.int8 + + if __name__ == "__main__": from paddle import enable_static -- GitLab