diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc index 07a608c5a2b4a8e48830281727e288bbc2c9d5dc..16bdfe6b05ccf07566df3a9106f937691b84cf6c 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -77,6 +77,16 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( ? "gelu_tanh" : "gelu_erf"; } + + if (matmul_type == "matmul") { + matmul_op->SetType("matmul_v2"); + matmul_op->SetAttr("trans_x", matmul_op->GetAttr("transpose_X")); + matmul_op->SetAttr("trans_y", matmul_op->GetAttr("transpose_Y")); + auto matmul_alpha = matmul_op->GetAttrIfExists("alpha"); + if (matmul_alpha != 1.0f) { + matmul_op->SetAttr("alpha", matmul_alpha); + } + } matmul_op->SetAttr("fuse_activation", act_type); matmul_op->SetOutput("Out", {activation_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc index f045377465e0322207d2d5ebdb888f74878e8d43..5590e5113e742c3101c4a172efa70c9e41d05455 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc @@ -65,6 +65,16 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( return; } + if (matmul_type == "matmul") { + matmul->Op()->SetType("matmul_v2"); + matmul->Op()->SetAttr("trans_x", matmul->Op()->GetAttr("transpose_X")); + matmul->Op()->SetAttr("trans_y", matmul->Op()->GetAttr("transpose_Y")); + auto matmul_alpha = matmul->Op()->GetAttrIfExists("alpha"); + if (matmul_alpha != 1.0f) { + matmul->Op()->SetAttr("alpha", matmul_alpha); + } + } + matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()}); matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc index 40dbaa03a0615f1456c6530ed1340741d443f193..1d70722f7363624a900c8d91ca5fc05b05a08333 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc @@ -84,6 +84,15 @@ void MatmulTransposeReshapeMKLDNNPass::Fuse( } OpDesc *matmul_desc = matmul_op->Op(); + if (matmul_type == "matmul") { + matmul_desc->SetType("matmul_v2"); + matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); + matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); + auto matmul_alpha = matmul_desc->GetAttrIfExists("alpha"); + if (matmul_alpha != 1.0f) { + matmul_desc->SetAttr("alpha", matmul_alpha); + } + } matmul_desc->SetOutput("Out", {reshape_out->Name()}); matmul_desc->SetAttr("fused_reshape_Out", reshape_shape); matmul_desc->SetAttr("fused_transpose_Out", transpose_axis); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc index cb06f6eb1205e94d0a1861183014edfc1a67de02..f28159a76aa69365e884981b487f654804f0bf24 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc @@ -85,6 +85,17 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph, scale = *(scale_tensor->data()); } + if (op_type == "matmul") { + operator_op->Op()->SetType("matmul_v2"); + operator_op->Op()->SetAttr("trans_x", + operator_op->Op()->GetAttr("transpose_X")); + operator_op->Op()->SetAttr("trans_y", + operator_op->Op()->GetAttr("transpose_Y")); + auto matmul_alpha = operator_op->Op()->GetAttrIfExists("alpha"); + if (matmul_alpha != 1.0f) { + operator_op->Op()->SetAttr("alpha", matmul_alpha); + } + } operator_op->Op()->SetAttr("fused_output_scale", scale); operator_op->Op()->SetOutput("Out", {scale_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 25a79509b53f531ce53cd354bea1e16f9680f5c0..4d26190a503f214740372e689089c0805795eda8 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -123,6 +123,15 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( return; } + if (matmul_type == "matmul") { + matmul_desc->SetType("matmul_v2"); + matmul_desc->SetAttr("trans_x", matmul_desc->GetAttr("transpose_X")); + matmul_desc->SetAttr("trans_y", matmul_desc->GetAttr("transpose_Y")); + auto matmul_alpha = matmul_desc->GetAttrIfExists("alpha"); + if (matmul_alpha != 1.0f) { + matmul_desc->SetAttr("alpha", matmul_alpha); + } + } matmul_desc->SetInput(matmul_input_name, {(reshape_in)->Name()}); matmul_desc->SetAttr("fused_reshape_" + matmul_input_name, reshape_shape); matmul_desc->SetAttr("fused_transpose_" + matmul_input_name, diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc index 2dd13573d98a054167db0a7686d106fb151af605..e5bba1a38f0a5e08fffeca2f679356177c2e4cc6 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc @@ -97,7 +97,7 @@ void TestMain(const std::string& op_name, bool with_xshapes) { int removed = 8; // 2* reshape, reshape_out, transpose, transpose_out if (with_xshapes) removed += 2; // transpose_xshape, reshape_xshape EXPECT_EQ(total_nodes_before - removed, total_nodes_after); - auto* matmul_op_desc = GetOpNodes(graph, op_name).at(0)->Op(); + auto* matmul_op_desc = GetOpNodes(graph, "matmul_v2").at(0)->Op(); auto check = [&matmul_op_desc](std::string a) { std::string shape_str = "fused_reshape_" + a; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 64db8598150c061b203f2c150f28231c2fdb90f3..cba18b3cdb2261df50fd3a9cf13ca44c11d09ff8 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -345,26 +345,6 @@ class MatMulGradKernel : public framework::OpKernel { } }; -framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, - std::string input_name) { - auto shape = ctx.Attrs().Get>("fused_reshape_" + input_name); - auto axis = - ctx.Attrs().Get>("fused_transpose_" + input_name); - auto dim = ctx.GetInputDim(input_name); - - PADDLE_ENFORCE_GT(dim.size(), - 0, - platform::errors::InvalidArgument( - "The Input(%s) has not been initialized properly. The " - "shape of Input(%s) = [%s].", - dim)); - - if (!shape.empty() && !axis.empty()) { - dim = dim.reshape(shape).transpose(axis); - } - return dim; -} - template class MatMulDoubleGradKernel : public framework::OpKernel { public: @@ -579,8 +559,8 @@ class MatMulOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul"); OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "matmul"); - auto dim_x = GetDimForInput(*context, "X"); - auto dim_y = GetDimForInput(*context, "Y"); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); #ifdef PADDLE_WITH_MKLDNN // (jczaja): For NHWC execution output shape needs @@ -681,14 +661,6 @@ class MatMulOp : public framework::OperatorWithKernel { framework::DDim ddim_out = phi::make_ddim(dim_out); -#ifdef PADDLE_WITH_MKLDNN - auto shape = context->Attrs().Get>("fused_reshape_Out"); - auto axis = context->Attrs().Get>("fused_transpose_Out"); - - if (!shape.empty() && !axis.empty()) { - ddim_out = ddim_out.transpose(axis).reshape(shape); - } -#endif context->SetOutputDim("Out", ddim_out); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd204e8cacfaf15e9189ca19dded05c113011164 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -0,0 +1,630 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace { +using dnnl::memory; +using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; +using phi::DenseTensor; +using phi::OneDNNContext; +using phi::vectorize; +using phi::funcs::OneDNNGetDataType; + +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. +static DenseTensor FoldOuterDims(const DenseTensor &input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, + const DenseTensor *input) { + auto input_dims = vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + DenseTensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = vectorize(output.dims()); + + memory::data_type input_type = phi::funcs::ToOneDNNDataType(input->dtype()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + output_dims, input->dtype(), input_type, dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, phi::funcs::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + auto &astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +template +class MatMulV1OneDNNHandler + : public phi::funcs::OneDNNHandlerNoCachingT { + public: + MatMulV1OneDNNHandler(const ExecutionContext &ctx, + const dnnl::engine engine, + phi::Place cpu_place, + const std::vector &x_org_dims, + const std::vector &y_org_dims) + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + auto trans_x = ctx.Attr("transpose_X"); + auto trans_y = ctx.Attr("transpose_Y"); + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (trans_x) { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } else { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } + + if (trans_y) { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } else { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + auto x_md = + memory::desc(x_dims, phi::funcs::OneDNNGetDataType(), x_strides); + auto y_md = + memory::desc(y_dims, phi::funcs::OneDNNGetDataType(), y_strides); + auto out_md = memory::desc( + out_ddims, phi::funcs::OneDNNGetDataType(), out_strides); + + dnnl::primitive_attr matmul_attrs; + dnnl::post_ops post_operations; + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {scale_out}); + } + matmul_attrs.set_post_ops(post_operations); + + this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); + } + + MatMulV1OneDNNHandler(const dnnl::engine engine, + phi::Place cpu_place, + DenseTensor *x, + bool trans_x, + DenseTensor *y, + bool trans_y, + DenseTensor *out, + float scale) + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + trans_x ? memory::dims{M * K, 1, M} : memory::dims{M * K, K, 1}; + + memory::dims y_strides = + trans_y ? memory::dims{N * K, 1, K} : memory::dims{N * K, N, 1}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, OneDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + + float ComputeOutputScale(const ExecutionContext &ctx) { + float alpha = ctx.Attr("alpha"); + if (ctx.HasAttr("Scale_x") && ctx.HasAttr("Scale_y") && + ctx.HasAttr("Scale_out")) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.HasAttr("force_fp32_output") + ? ctx.Attr("force_fp32_output") + : false; + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + alpha *= scale_out / (scale_x * scale_y); + } + return alpha; + } + + std::shared_ptr AcquireWeightsMemory(const DenseTensor *input) { + const YT *input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(input_data)); + } + + std::shared_ptr AcquireDstMemory(DenseTensor *output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of DenseTensor as computed in ComputeInferShape + OT *ptr = output->mutable_data(this->place_); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } + + private: + uint16_t batch_size_; +}; + +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutToMatrixSequence(DenseTensor *x, + DenseTensor *y, + DenseTensor *out, + bool trans_x, + bool trans_y) { + auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); + auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, + mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + +std::vector Transpose(const std::vector &x, + const std::vector &axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); + + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), + axis_size, + phi::errors::InvalidArgument( + "In an axis array, elements must be unique.")); + + PADDLE_ENFORCE_EQ( + in_rank, + axis_size, + phi::errors::InvalidArgument("The input dimension's size " + "should be equal to the axis's size. " + "But received dimension is %d, " + "axis's size is %d", + in_rank, + axis_size)); + + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), + axis_size, + phi::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); + + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; +} + +template +void ExecuteMatMul(const ExecutionContext &ctx, + const DenseTensor *x, + const std::vector &x_dims, + const DenseTensor *y, + const std::vector &y_dims, + DenseTensor *out) { + const auto &dev_ctx = ctx.template device_context(); + MatMulV1OneDNNHandler handler( + ctx, dev_ctx.GetEngine(), ctx.GetPlace(), x_dims, y_dims); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); +} + +template +class MatMulV1OneDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + phi::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + constexpr bool is_int8 = phi::funcs::is_int8(); + constexpr bool is_bfloat16 = phi::funcs::is_bfloat16(); + const bool force_fp32_output = ctx.HasAttr("force_fp32_output") + ? ctx.Attr("force_fp32_output") + : false; + constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + + auto x_dims = vectorize(x->dims()); + auto y_dims = vectorize(y->dims()); + + int ndims = std::max(x_dims.size(), y_dims.size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + CalculateMatrixDims(x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); + + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { + ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); + } else if (is_bfloat16) { + ExecuteMatMul( + ctx, x, x_bd_dims, y, y_bd_dims, out); + } else if (fuse_relu) { + ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); + } else { + ExecuteMatMul(ctx, x, x_bd_dims, y, y_bd_dims, out); + } + } + + private: + void CalculateMatrixDims(const std::vector &x_dims, + const std::vector &y_dims, + std::vector *x_bd_dims, + std::vector *y_bd_dims, + DenseTensor *out) const { + if (x_dims.size() == 1) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; + } else if (x_dims.size() == 2) { + (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1]; + (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0]; + } else { + for (size_t i = 0; i < x_dims.size(); ++i) { + (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i]; + } + } + if (y_dims.size() == 1) { + (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0]; + } else if (y_dims.size() == 2) { + (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1]; + (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0]; + } else { + for (size_t i = 0; i < y_dims.size(); ++i) { + (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i]; + } + } + + if (x_dims.size() > 2 && y_dims.size() > 2) { + auto out_dims = vectorize(out->dims()); + for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { + PADDLE_ENFORCE_EQ( + (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || + (*y_bd_dims)[i] == 1, + true, + phi::errors::InvalidArgument( + "DenseTensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, + (*x_bd_dims)[i], + i, + (*y_bd_dims)[i])); + (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); + } + out->Resize(phi::make_ddim((out_dims))); + } + } +}; + +template +class MatMulV1GradOneDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + phi::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(paddle::framework::GradVarName("Out")); + auto *dx = ctx.Output(paddle::framework::GradVarName("X")); + auto *dy = ctx.Output(paddle::framework::GradVarName("Y")); + + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + phi::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + phi::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &y, + false, + false, + &dout, + true, + false, + dx); + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &x, + false, + false, + &dout, + false, + true, + dy); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + false, + true, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); + } else { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + true, + false, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + dx->set_mem_desc(x.mem_desc()); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + dy->set_mem_desc(y.mem_desc()); + } + } + } + + private: + void ExecuteMatMulGrad(const ExecutionContext &ctx, + const OneDNNContext &dev_ctx, + const dnnl::engine &engine, + DenseTensor *x, + bool trans_x, + bool is_fold_init_dims_x, + DenseTensor *y, + bool trans_y, + bool is_fold_init_dims_y, + DenseTensor *out) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + DenseTensor x_combined, y_combined; + if (need_combine) { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } else { + x_combined = *x; + y_combined = *y; + } + + float alpha = ctx.Attr("alpha"); + + MatMulV1OneDNNHandler handler(engine, + ctx.GetPlace(), + &x_combined, + trans_x, + &y_combined, + trans_y, + out, + alpha); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + } +}; + +} // anonymous namespace + +REGISTER_OP_KERNEL(matmul, + MKLDNN, + ::phi::CPUPlace, + MatMulV1OneDNNKernel, + MatMulV1OneDNNKernel, + MatMulV1OneDNNKernel, + MatMulV1OneDNNKernel); + +REGISTER_OP_KERNEL(matmul_grad, + MKLDNN, + ::phi::CPUPlace, + MatMulV1GradOneDNNKernel, + MatMulV1GradOneDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc deleted file mode 100644 index 7bf66bae93cc3e0659dce090415bac35c357e462..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ /dev/null @@ -1,941 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace { -using dnnl::memory; -using paddle::framework::ExecutionContext; -using paddle::framework::GradVarName; -using phi::OneDNNContext; -using phi::vectorize; -using phi::funcs::OneDNNGetDataType; - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static phi::DenseTensor FoldOuterDims(const phi::DenseTensor &input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static phi::DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, - const phi::DenseTensor *input) { - auto input_dims = vectorize(input->dims()); - if (input_dims.size() != 3) { - return *input; - } - - phi::DenseTensor output; - output.Resize({input_dims[1], input_dims[0], input_dims[2]}); - - auto output_dims = vectorize(output.dims()); - - memory::data_type input_type = phi::funcs::ToOneDNNDataType(input->dtype()); - phi::funcs::ReorderOneDNNHandler reorder_handler( - output_dims, input->dtype(), input_type, dev_ctx.GetEngine()); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - memory::format_tag::abc, phi::funcs::to_void_cast(input->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - &output, memory::format_tag::bac, dev_ctx.GetPlace()); - auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, - reorder_dst_memory_p); - - auto &astream = OneDNNContext::tls().get_stream(); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); - return output; -} - -phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) { - auto shape = ctx.Attr>("fused_reshape_" + input_name); - auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); - if (!shape.empty() && !axis.empty()) { - return input_dims.reshape(shape).transpose(axis); - } - return input_dims; -} - -template -class MatMulV2MKLDNNHandler - : public phi::funcs::OneDNNHandlerNoCachingT { - public: - MatMulV2MKLDNNHandler(const ExecutionContext &ctx, - const dnnl::engine engine, - paddle::platform::Place cpu_place, - const std::vector &x_org_dims, - bool trans_x, - const std::vector &y_org_dims, - bool trans_y, - bool is_output_fused, - const std::vector &x_strides_override, - const std::vector &y_strides_override) - : phi::funcs::OneDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (!x_strides_override.empty()) { - x_strides = x_strides_override; - } else { - if (!trans_x) { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } else { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } - } - - if (!y_strides_override.empty()) { - y_strides = y_strides_override; - } else { - if (!trans_y) { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } else { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - if (x_strides_override.empty()) { - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - } - if (y_strides_override.empty()) { - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - } - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - // TODO(jczaja): Why not for int8?? - if (!phi::funcs::is_int8() && is_output_fused) { - out_strides = FakeTransposeStrides(out_ddims); - } - - auto x_md = - memory::desc(x_dims, phi::funcs::OneDNNGetDataType(), x_strides); - auto y_md = - memory::desc(y_dims, phi::funcs::OneDNNGetDataType(), y_strides); - auto out_md = memory::desc( - out_ddims, phi::funcs::OneDNNGetDataType(), out_strides); - - const dnnl::primitive_attr matmul_attrs = CreateMatmulAttrs(ctx); - - this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); - } - - void AppendActivation(const ExecutionContext &ctx, - dnnl::post_ops &post_ops, // NOLINT - float activation_scale = 1.0f) { - const auto invalid_attribute = - ctx.HasAttr("fuse_activation") - ? ctx.Attr("fuse_activation").empty() - : true; - if (invalid_attribute) return; - - const auto fuse_activation = ctx.Attr("fuse_activation"); - const auto fuse_alpha = - ctx.HasAttr("fuse_alpha") ? ctx.Attr("fuse_alpha") : 0.0f; - const auto fuse_beta = - ctx.HasAttr("fuse_beta") ? ctx.Attr("fuse_beta") : 0.0f; - - if (fuse_activation == "hard_sigmoid") { - post_ops.append_eltwise(activation_scale, - dnnl::algorithm::eltwise_linear, - fuse_alpha, - fuse_beta); - post_ops.append_eltwise( - activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); - } else { - const std::unordered_map activation_map = { - {"abs", dnnl::algorithm::eltwise_abs}, - {"clip", dnnl::algorithm::eltwise_clip}, - {"gelu", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, - {"hard_swish", dnnl::algorithm::eltwise_hardswish}, - {"leaky_relu", dnnl::algorithm::eltwise_relu}, - {"mish", dnnl::algorithm::eltwise_mish}, - {"relu", dnnl::algorithm::eltwise_relu}, - {"relu6", dnnl::algorithm::eltwise_bounded_relu}, - {"sigmoid", dnnl::algorithm::eltwise_logistic}, - {"sqrt", dnnl::algorithm::eltwise_sqrt}, - {"swish", dnnl::algorithm::eltwise_swish}, - {"tanh", dnnl::algorithm::eltwise_tanh}}; - - const auto &activation_type = activation_map.find(fuse_activation); - - PADDLE_ENFORCE_NE( - activation_type, - activation_map.end(), - phi::errors::InvalidArgument( - "Activation '%s' not found in oneDNN algorithms mapper", - fuse_activation)); - - post_ops.append_eltwise( - activation_scale, activation_type->second, fuse_alpha, fuse_beta); - } - } - - float ComputeOutputScale(const ExecutionContext &ctx) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; - if (ctx.HasAttr("Scale_x") && ctx.HasAttr("Scale_y") && - ctx.HasAttr("Scale_out")) { - float scale_x = ctx.Attr("Scale_x"); - float scale_y = ctx.Attr("Scale_y"); - bool force_fp32_out = ctx.HasAttr("force_fp32_output") - ? ctx.Attr("force_fp32_output") - : false; - float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); - alpha *= scale_out / (scale_x * scale_y); - } - return alpha; - } - - dnnl::primitive_attr CreateMatmulAttrs(const ExecutionContext &ctx) { - dnnl::primitive_attr matmul_attrs; - dnnl::post_ops post_operations; - - float scale_out = ComputeOutputScale(ctx); - if (scale_out != 1.0f) { - matmul_attrs.set_output_scales(0, {scale_out}); - } - - if (ctx.HasInput("ResidualData")) { - auto *residual_data = ctx.Input("ResidualData"); - auto residual_data_tz = phi::vectorize(residual_data->dims()); - auto residual_data_md = memory::desc(residual_data_tz, - phi::funcs::OneDNNGetDataType(), - dnnl::memory::format_tag::any); - post_operations.append_binary(dnnl::algorithm::binary_add, - residual_data_md); - if (ctx.HasAttr("Scale_in_eltwise")) { - float sum_scale = scale_out / ctx.Attr("Scale_in_eltwise"); - post_operations.append_sum(sum_scale); - } - } - - AppendActivation(ctx, post_operations); - - if (ctx.HasAttr("fused_output_scale")) { - float scale_alpha = ctx.Attr("fused_output_scale"); - post_operations.append_eltwise( - 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); - } - - matmul_attrs.set_post_ops(post_operations); - return matmul_attrs; - } - - std::vector FakeTransposeStrides( - const std::vector &matmul_out_dims) const { - // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and - // transpose axis are: {0, 2, 1, 3} - std::vector transpose_axis = {0, 2, 1, 3}; - std::vector fake_strides(transpose_axis.size()); - int ndims = static_cast(transpose_axis.size()); - - int total_stride = 1; - - for (int i = ndims - 1; i >= 0; --i) { - fake_strides[transpose_axis[i]] = total_stride; - total_stride *= matmul_out_dims[transpose_axis[i]]; - } - - return fake_strides; - } - - std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { - const YT *input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(input_data)); - } - - std::shared_ptr AcquireDstMemory(phi::DenseTensor *output) { - // We cannot use base AcquireDstMemory as it makes an allocation request - // base on DST memory primitive size. This is fine in general, but in MatMul - // we have primitive that covers only one batch of Data and then shift - // pointer for every new batch. Hence phi::DenseTensor size is bigger that - // dst memory primitive size. So would we request less memory that is there - // and it triggers an assertion. So as there is no 'any' format here we can - // leave default size of phi::DenseTensor as computed in ComputeInferShape - OT *ptr = output->mutable_data(this->place_); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); - } -}; - -template -class MatMulMKLDNNHandler - : public phi::funcs::OneDNNHandlerNoCachingT { - public: - MatMulMKLDNNHandler(const dnnl::engine engine, - paddle::platform::Place cpu_place, - phi::DenseTensor *x, - bool trans_x, - phi::DenseTensor *y, - bool trans_y, - phi::DenseTensor *out, - float scale) - : phi::funcs::OneDNNHandlerNoCachingT(engine, - cpu_place) { - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y); - - memory::dim x_bs = mat_dim_x.batch_size_; - memory::dim y_bs = mat_dim_y.batch_size_; - - memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; - const memory::dim M = mat_dim_x.height_; - const memory::dim N = mat_dim_y.width_; - const memory::dim K = mat_dim_x.width_; - - memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; - memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; - memory::dims out_dims = {out_bs, M, N}; - - memory::dims x_strides = - !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; - - memory::dims y_strides = - !trans_y ? memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; - memory::dims out_strides = memory::dims{M * N, N, 1}; - - auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_dims, OneDNNGetDataType(), out_strides); - - dnnl::primitive_attr attrs; - if (scale != 1.0f) attrs.set_output_scales(0, {scale}); - - this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); - } - - std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { - const YT *input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(input_data)); - } - - public: - void Execute(const phi::DenseTensor *x, - const phi::DenseTensor *y, - phi::DenseTensor *out) { - const auto src_memory_p = this->AcquireSrcMemory(x); - const auto weights_memory_p = this->AcquireWeightsMemory(y); - const auto dst_memory_p = this->AcquireDstMemory(out); - - auto matmul_p = this->AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto &astream = OneDNNContext::tls().get_stream(); - - // Simulate batch matmul by processing in loop - void *x_ptr = src_memory_p->get_data_handle(); - void *y_ptr = weights_memory_p->get_data_handle(); - void *out_ptr = dst_memory_p->get_data_handle(); - auto offsets = std::make_tuple(x_offset_, y_offset_, out_offset_); - for (uint16_t i = 0; i < batch_size_; ++i) { - src_memory_p->set_data_handle(x_ptr); - weights_memory_p->set_data_handle(y_ptr); - dst_memory_p->set_data_handle(out_ptr); - matmul_p->execute(astream, matmul_args); - x_ptr = static_cast(x_ptr) + std::get<0>(offsets); - y_ptr = static_cast(y_ptr) + std::get<1>(offsets); - out_ptr = static_cast(out_ptr) + std::get<2>(offsets); - } - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims())); - } - - std::shared_ptr AcquireDstMemory(phi::DenseTensor *output) { - // We cannot use base AcquireDstMemory as it makes an allocation request - // base on DST memory primitive size. This is fine in general, but in MatMul - // we have primitive that covers only one batch of Data and then shift - // pointer for every new batch. Hence phi::DenseTensor size is bigger that - // dst memory primitive size. So would we request less memory that is there - // and it triggers an assertion. So as there is no 'any' format here we can - // leave default size of phi::DenseTensor as computed in ComputeInferShape - OT *ptr = output->mutable_data(this->place_); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); - } - - private: - uint32_t x_offset_; - uint32_t y_offset_; - uint32_t out_offset_; - uint16_t batch_size_; -}; - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorToMatrixSequence( - phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. - */ -static void ReshapeXYOutToMatrixSequence(phi::DenseTensor *x, - phi::DenseTensor *y, - phi::DenseTensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); - auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorToMatrixSequence(x, mat_dim_x); - ReshapeTensorToMatrixSequence(y, mat_dim_y); -} - -std::vector Transpose(const std::vector &x, - const std::vector &axis) { - size_t in_rank = x.size(); - size_t axis_size = axis.size(); - - auto axis_set = std::set(axis.begin(), axis.end()); - PADDLE_ENFORCE_EQ(axis_set.size(), - axis_size, - paddle::platform::errors::InvalidArgument( - "In an axis array, elements must be unique.")); - - PADDLE_ENFORCE_EQ(in_rank, - axis_size, - paddle::platform::errors::InvalidArgument( - "The input dimension's size " - "should be equal to the axis's size. " - "But received dimension is %d, " - "axis's size is %d", - in_rank, - axis_size)); - - PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), - axis_size, - paddle::platform::errors::InvalidArgument( - "Axis values must be ranging from 0 to (dims - 1).")); - - std::vector new_x(x.size()); - for (size_t i = 0; i < x.size(); i++) { - new_x[i] = x[axis[i]]; - } - return new_x; -} - -std::vector GetInputStrides(const ExecutionContext &ctx, - const std::string input_name) { - auto shape = ctx.Attr>("fused_reshape_" + input_name); - auto axis = ctx.Attr>("fused_transpose_" + input_name); - auto input_dims = ctx.Input(input_name)->dims(); - auto new_dims = input_dims; - if (!shape.empty() && !axis.empty()) { - new_dims = input_dims.reshape(shape).transpose(axis); - } - - auto &MatrixDimsFromVector = input_name == "X" - ? phi::funcs::RowMatrixDimsFromVector - : phi::funcs::ColumnMatrixDimsFromVector; - phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor( - MatrixDimsFromVector(new_dims), - 0, - ctx.HasAttr("trans_x") - ? ctx.Attr(std::string("trans_") + - static_cast(std::tolower(input_name[0]))) - : ctx.Attr(std::string("transpose_") + input_name[0])); - - std::vector strides; - if (!shape.empty()) { - auto shape2 = input_dims.reshape(shape); - strides.push_back(1); - for (auto i = shape2.size() - 1; i > 0; --i) { - strides.insert(strides.begin(), - strides.front() * static_cast(shape2[i])); - } - strides = Transpose(strides, axis); - if (shape.size() == 2) - strides.insert(strides.begin(), - static_cast(shape[0] * shape[1])); - mat_dim.stride_ = strides[0]; - if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); - } - return strides; -} - -bool IsOutputFused(const ExecutionContext &ctx) { - auto &fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); - auto &fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); - return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); -} - -template -void ExecuteMatMulV2(const ExecutionContext &ctx, - const dnnl::engine onednn_engine, - const phi::DenseTensor *x, - const std::vector &x_dims, - bool trans_x, - const phi::DenseTensor *y, - const std::vector &y_dims, - bool trans_y, - phi::DenseTensor *out) { - std::vector x_strides_override = GetInputStrides(ctx, "X"); - std::vector y_strides_override = GetInputStrides(ctx, "Y"); - MatMulV2MKLDNNHandler handler(ctx, - onednn_engine, - ctx.GetPlace(), - x_dims, - trans_x, - y_dims, - trans_y, - IsOutputFused(ctx), - x_strides_override, - y_strides_override); - - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - if (ctx.HasInput("ResidualData")) { - auto *residual_data = ctx.Input("ResidualData"); - const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); - matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, - *residual_data_memory_p}); - } - - auto &astream = OneDNNContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need - // permute - if (IsOutputFused(ctx) && !phi::funcs::is_int8()) { - auto axis = ctx.Attr>("fused_transpose_Out"); - auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); - out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); - } else { - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); - } -} - -template -class MatMulMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), - 1, - paddle::platform::errors::Unimplemented( - "oneDNN matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - constexpr bool is_int8 = phi::funcs::is_int8(); - constexpr bool is_bfloat16 = phi::funcs::is_bfloat16(); - const bool force_fp32_output = ctx.HasAttr("force_fp32_output") - ? ctx.Attr("force_fp32_output") - : false; - constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses - - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); - bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") - : ctx.Attr("transpose_X"); - bool trans_y = ctx.HasAttr("trans_y") ? ctx.Attr("trans_y") - : ctx.Attr("transpose_Y"); - - auto x_dims = vectorize(GetDimForInput(ctx, "X")); - auto y_dims = vectorize(GetDimForInput(ctx, "Y")); - - int ndims = std::max(x_dims.size(), y_dims.size()); - ndims = std::max(ndims, 3); - - std::vector x_bd_dims(ndims, 1); - std::vector y_bd_dims(ndims, 1); - - CalculateMatrixDims(ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); - - if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - ExecuteMatMulV2(ctx, - onednn_engine, - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out); - } else if (is_bfloat16) { - ExecuteMatMulV2(ctx, - onednn_engine, - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out); - } else if (fuse_relu) { - ExecuteMatMulV2(ctx, - onednn_engine, - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out); - } else { - ExecuteMatMulV2(ctx, - onednn_engine, - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out); - } - } - - private: - void CalculateMatrixDims(const ExecutionContext &ctx, - const std::vector &x_dims, - const std::vector &y_dims, - std::vector *x_bd_dims, - std::vector *y_bd_dims, - phi::DenseTensor *out) const { - if (x_dims.size() == 1) { - (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; - } else if (x_dims.size() == 2) { - (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[1]; - (*x_bd_dims)[(*x_bd_dims).size() - 2] = x_dims[0]; - } else { - for (size_t i = 0; i < x_dims.size(); ++i) { - (*x_bd_dims)[(*x_bd_dims).size() - x_dims.size() + i] = x_dims[i]; - } - } - if (y_dims.size() == 1) { - (*y_bd_dims)[(*x_bd_dims).size() - 2] = y_dims[0]; - } else if (y_dims.size() == 2) { - (*y_bd_dims)[(*y_bd_dims).size() - 1] = y_dims[1]; - (*y_bd_dims)[(*y_bd_dims).size() - 2] = y_dims[0]; - } else { - for (size_t i = 0; i < y_dims.size(); ++i) { - (*y_bd_dims)[(*y_bd_dims).size() - y_dims.size() + i] = y_dims[i]; - } - } - - if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) { - auto out_dims = vectorize(out->dims()); - for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { - PADDLE_ENFORCE_EQ( - (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || - (*y_bd_dims)[i] == 1, - true, - paddle::platform::errors::InvalidArgument( - "phi::DenseTensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, - (*x_bd_dims)[i], - i, - (*y_bd_dims)[i])); - (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); - } - out->Resize(phi::make_ddim((out_dims))); - } - } -}; - -template -class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), - 1, - paddle::platform::errors::Unimplemented( - "oneDNN matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = - *ctx.Input(paddle::framework::GradVarName("Out")); - auto *dx = - ctx.Output(paddle::framework::GradVarName("X")); - auto *dy = - ctx.Output(paddle::framework::GradVarName("Y")); - - bool transpose_x = ctx.HasAttr("transpose_X") - ? ctx.Attr("transpose_X") - : ctx.Attr("trans_x"); - bool transpose_y = ctx.HasAttr("transpose_Y") - ? ctx.Attr("transpose_Y") - : ctx.Attr("trans_y"); - - ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - paddle::framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - paddle::framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); - } else if (transpose_x) { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &y, - false, - false, - &dout, - true, - false, - dx); - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &x, - false, - false, - &dout, - false, - true, - dy); - } else if (transpose_y) { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &dout, - false, - false, - &y, - false, - true, - dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); - } else { - this->ExecuteMatMulGrad(ctx, - dev_ctx, - onednn_engine, - &dout, - false, - false, - &y, - true, - false, - dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - dx->set_mem_desc(x.mem_desc()); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - dy->set_mem_desc(y.mem_desc()); - } - } - } - - private: - void ExecuteMatMulGrad(const ExecutionContext &ctx, - const OneDNNContext &dev_ctx, - const dnnl::engine &engine, - phi::DenseTensor *x, - bool trans_x, - bool is_fold_init_dims_x, - phi::DenseTensor *y, - bool trans_y, - bool is_fold_init_dims_y, - phi::DenseTensor *out) const { - // gradient is calculated in a different way when broadcasting is used - bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && - out->dims().size() == 2; - - phi::DenseTensor x_combined, y_combined; - if (!need_combine) { - x_combined = *x; - y_combined = *y; - } else { - x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) - : FoldFirstAndLastDims(dev_ctx, x); - y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) - : FoldFirstAndLastDims(dev_ctx, y); - } - - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; - - MatMulMKLDNNHandler handler(engine, - ctx.GetPlace(), - &x_combined, - trans_x, - &y_combined, - trans_y, - out, - alpha); - - const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); - const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); - const auto dst_memory_p = handler.AcquireDstMemory(out); - - auto matmul_p = handler.AcquireForwardPrimitive(); - - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto &astream = OneDNNContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); - - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); - } -}; - -} // anonymous namespace - -REGISTER_OP_KERNEL(matmul, - MKLDNN, - ::phi::CPUPlace, - MatMulMKLDNNKernel, - MatMulMKLDNNKernel, - MatMulMKLDNNKernel, - MatMulMKLDNNKernel); - -REGISTER_OP_KERNEL(matmul_grad, - MKLDNN, - ::phi::CPUPlace, - MatMulGradMKLDNNKernel, - MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 94f0fa2a606c3642e835d8184e98186b14bed3e5..001705a3c048ddc094f12f70deefda20e90ddb77 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -99,7 +99,7 @@ const std::unordered_map {"fuse_alpha", ExtraAttrProperty::ONEDNN}, {"fuse_beta", ExtraAttrProperty::ONEDNN}, {"fuse_relu", ExtraAttrProperty::ONEDNN}, - {"fused_output_scale", ExtraAttrProperty::ONEDNN}, + {"alpha", ExtraAttrProperty::ONEDNN}, {"fuse_residual_connection", ExtraAttrProperty::ONEDNN}, {"fuse_with_relu", ExtraAttrProperty::ONEDNN}, {"fused_reshape_Out", ExtraAttrProperty::ONEDNN}, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index 964aad16b971107ca92a1d41355263c1b7030a60..510fa4f79b21cb111d8586939fb79ae56ee22632 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -146,7 +146,7 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): 'operator_scale_onednn_fuse_pass', ], ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['matmul_v2'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py index 0e0c542be632c3546640d885bb165579af7c756d..cdfbed4c504c3206007360d4a0aef5b647bf91fc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -137,7 +137,7 @@ class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest): 'matmul_activation_mkldnn_fuse_pass', ], ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['matmul_v2'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py index b359d4a4c93c43056550e7cd0f0654502451c4ee..42152e74daa59effdf8f98920195e432f4e8d63b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -76,7 +76,7 @@ class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest): config = self.create_inference_config( use_mkldnn=True, passes=['matmul_elementwise_add_mkldnn_fuse_pass'] ) - yield config, ['matmul'], (1e-5, 1e-5) + yield config, ['matmul_v2'], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py index a2d2260683020777431c572d5ed8104a3991afce..11e61fcf09728941304bf060d745b73748e66eea 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py @@ -116,7 +116,7 @@ class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ["matmul_v2"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index 2d368433edc3cafa3a8b661f633493bb44ab0af5..41bc180053877f6782cb2566b110686d0454a2b6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -135,17 +135,8 @@ class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest): return program_config def sample_predictor_configs(self, program_config): - # gpu_cpu_map_matmul_v2_to_matmul_pass will affect the type of final fused op - fused_op = "matmul_v2" - input1_dim1 = program_config.inputs["input_data1"].shape[0] - input2_dim1 = program_config.inputs["input_data2"].shape[0] - input1_dim2 = program_config.inputs["input_data1"].shape[1] - input2_dim2 = program_config.inputs["input_data2"].shape[1] - if input1_dim1 == input2_dim1 and input1_dim2 == input2_dim2: - fused_op = "matmul" - config = self.create_inference_config(use_mkldnn=True) - yield config, [fused_op], (1e-5, 1e-5) + yield config, ["matmul_v2"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py index fc4d80060756c75e7b15131e8147047a138cd24d..2ab7184237adc0ca575ce205ac6d8cffb773efa2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py @@ -153,7 +153,7 @@ class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) - yield config, ["matmul"], (1e-5, 1e-5) + yield config, ["matmul_v2"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py index 117af536705fb5cac568144c285c7d5473ea21d0..4304527a2088455fdb50675d62bbb6661b4cd472 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest class TestDnnlMatMulOp(OpTest): @@ -254,321 +254,6 @@ class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp): self.attrs = {'force_fp32_output': True} -@skip_check_grad_ci(reason="DNNL's MatMul doesn't implement grad kernel.") -class TestReshapeTransposeMatMulOp(OpTest): - def init_data_type(self): - self.data_type_ = 'float32' - - def generate_data(self): - self.x = ( - np.random.random([2, 128, 768]) - .astype("float32") - .reshape([2, 128, 12, 64]) - .transpose([0, 2, 1, 3]) - ) - self.y = ( - np.random.random([2, 128, 768]) - .astype("float32") - .reshape([2, 128, 12, 64]) - .transpose([0, 2, 1, 3]) - ) - self.out = np.matmul(self.x, self.y.transpose([0, 1, 3, 2])) - self.fused_reshape_X = [] - self.fused_transpose_X = [] - self.fused_reshape_Y = [] - self.fused_transpose_Y = [] - - def set_op_type_and_transpose_y_name(self): - self.op_type = "matmul" - self.transpose_y_name = "transpose_Y" - - def setUp(self): - self.set_op_type_and_transpose_y_name() - self._cpu_only = True - self.use_mkldnn = True - self.transpose_y = True - self.init_data_type() - self.generate_data() - - self.inputs = {'X': self.x, 'Y': self.y} - self.attrs = { - 'use_mkldnn': self.use_mkldnn, - self.transpose_y_name: self.transpose_y, - } - if len(self.fused_transpose_X) > 0: - self.attrs['fused_transpose_X'] = self.fused_transpose_X - if len(self.fused_transpose_Y) > 0: - self.attrs['fused_transpose_Y'] = self.fused_transpose_Y - if len(self.fused_reshape_X) > 0: - self.attrs['fused_reshape_X'] = self.fused_reshape_X - if len(self.fused_reshape_Y) > 0: - self.attrs['fused_reshape_Y'] = self.fused_reshape_Y - - self.outputs = {'Out': self.out} - - def test_check_output(self): - self.check_output() - - -class TestReshapeTransposeMatMulOp4DXFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = np.random.random([2, 128, 768]).astype("float32") - self.y = ( - np.random.random([2, 128, 768]) - .astype("float32") - .reshape([2, 128, 12, 64]) - .transpose([0, 2, 1, 3]) - ) - self.fused_transpose_X = [0, 2, 1, 3] - self.fused_reshape_X = [0, 0, 12, 64] - self.fused_transpose_Y = [] - self.fused_reshape_Y = [] - self.out = np.matmul( - self.x.reshape([2, 128, 12, 64]).transpose([0, 2, 1, 3]), - self.y.transpose([0, 1, 3, 2]), - ) - - -class TestReshapeTransposeMatMulOp4DXInt8(TestReshapeTransposeMatMulOp4DXFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp4DYFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = ( - np.random.random([2, 128, 768]) - .astype("float32") - .reshape([2, 128, 12, 64]) - .transpose([0, 2, 1, 3]) - ) - self.y = np.random.random([2, 128, 768]).astype("float32") - self.fused_transpose_X = [] - self.fused_reshape_X = [] - self.fused_transpose_Y = [0, 2, 1, 3] - self.fused_reshape_Y = [0, 0, 12, 64] - self.out = np.matmul( - self.x, self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]) - ) - - -class TestReshapeTransposeMatMulOp4DYInt8(TestReshapeTransposeMatMulOp4DYFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp4DXYFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = np.random.random([2, 128, 768]).astype("float32") - self.y = np.random.random([2, 128, 768]).astype("float32") - self.fused_transpose_X = [0, 2, 1, 3] - self.fused_reshape_X = [0, 0, 12, 64] - self.fused_transpose_Y = [0, 2, 1, 3] - self.fused_reshape_Y = [0, 0, 12, 64] - self.out = np.matmul( - self.x.reshape([2, 128, 12, 64]).transpose([0, 2, 1, 3]), - self.y.reshape([2, 128, 12, 64]).transpose([0, 2, 3, 1]), - ) - - -class TestReshapeTransposeMatMulOp4DXYInt8( - TestReshapeTransposeMatMulOp4DXYFloat -): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp2DXFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = np.random.random([2, 5, 10]).astype("float32") - self.y = ( - np.random.random([2, 5, 10]) - .astype("float32") - .reshape([10, 10]) - .transpose([1, 0]) - ) - self.fused_transpose_X = [1, 0] - self.fused_reshape_X = [10, 10] - self.fused_transpose_Y = [] - self.fused_reshape_Y = [] - self.out = np.matmul( - self.x.reshape([10, 10]).transpose([1, 0]), self.y.transpose([1, 0]) - ) - - -class TestReshapeTransposeMatMulOp2DXInt8(TestReshapeTransposeMatMulOp2DXFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp2DYFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = ( - np.random.random([2, 5, 10]) - .astype("float32") - .reshape([10, 10]) - .transpose([1, 0]) - ) - self.y = np.random.random([2, 5, 10]).astype("float32") - self.fused_transpose_X = [] - self.fused_reshape_X = [] - self.fused_transpose_Y = [1, 0] - self.fused_reshape_Y = [10, 10] - self.out = np.matmul(self.x, self.y.reshape([10, 10])) - - -class TestReshapeTransposeMatMulOp2DYInt8(TestReshapeTransposeMatMulOp2DYFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp3DXFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = np.random.random([2, 2, 5, 5]).astype("float32") - self.y = ( - np.random.random([2, 2, 5, 5]) - .astype("float32") - .reshape([2, 10, 5]) - .transpose([0, 2, 1]) - ) - self.fused_transpose_X = [0, 2, 1] - self.fused_reshape_X = [2, 10, 5] - self.fused_transpose_Y = [] - self.fused_reshape_Y = [] - self.out = np.matmul( - self.x.reshape([2, 10, 5]).transpose(0, 2, 1), - self.y.transpose(0, 2, 1), - ) - - -class TestReshapeTransposeMatMulOp3DXInt8(TestReshapeTransposeMatMulOp3DXFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -class TestReshapeTransposeMatMulOp3DYFloat(TestReshapeTransposeMatMulOp): - def generate_data(self): - self.x = ( - np.random.random([2, 2, 5, 5]) - .astype(self.data_type_) - .reshape([2, 10, 5]) - .transpose([0, 2, 1]) - ) - self.y = np.random.random([2, 2, 5, 5]).astype(self.data_type_) - self.fused_transpose_X = [] - self.fused_reshape_X = [] - self.fused_transpose_Y = [0, 2, 1] - self.fused_reshape_Y = [2, 10, 5] - self.out = np.matmul(self.x, self.y.reshape([2, 10, 5])) - - -class TestReshapeTransposeMatMulOp3DYInt8(TestReshapeTransposeMatMulOp3DYFloat): - def init_data_type(self): - self.data_type_ = 'int8' - - -@skip_check_grad_ci(reason="Tests inference only optimization.") -class TestMatMulOpTransposeReshapeEmptyFloat(OpTest): - def init_data_type(self): - self.data_type_ = np.float32 - - def generate_data(self): - self.bs = 1 - self.x = np.random.random([self.bs, 128, 128]).astype(self.data_type_) - self.y = np.random.random([self.bs, 128, 64]).astype(self.data_type_) - - def init_params_and_out(self): - self.transpose_out = [] - self.reshape_out = [] - self.out = np.matmul(self.x, self.y) - - def set_op_type(self): - self.op_type = "matmul" - - def setUp(self): - self.set_op_type() - self._cpu_only = True - self.use_mkldnn = True - self.init_data_type() - self.generate_data() - self.init_params_and_out() - - self.inputs = {'X': self.x, 'Y': self.y} - self.attrs = {'use_mkldnn': self.use_mkldnn} - - if len(self.reshape_out) > 0: - self.attrs['fused_reshape_Out'] = self.reshape_out - if len(self.transpose_out) > 0: - self.attrs['fused_transpose_Out'] = self.transpose_out - - self.inputs = {'X': self.x, 'Y': self.y} - self.outputs = {'Out': self.out} - - def test_check_output(self): - self.check_output() - - def check_raise_error(self, msg): - try: - self.check_output() - except Exception as e: - if msg in str(e): - raise AttributeError - else: - print(e) - - -class TestMatMulOpTransposeReshapeIntEmptyInt( - TestMatMulOpTransposeReshapeEmptyFloat -): - def init_data_type(self): - self.data_type_ = np.int8 - - -class TestMatMulOpTransposeReshapeBasicFloat( - TestMatMulOpTransposeReshapeEmptyFloat -): - def generate_data(self): - self.bs = 8 - self.x = np.random.random([self.bs, 12, 128, 128]).astype( - self.data_type_ - ) - self.y = np.random.random([self.bs, 12, 128, 64]).astype( - self.data_type_ - ) - - def init_params_and_out(self): - self.transpose_out = [0, 2, 1, 3] - self.reshape_out = [0, 0, self.x.shape[1] * self.y.shape[-1]] - self.out = ( - np.matmul(self.x, self.y) - .transpose([0, 2, 1, 3]) - .reshape([self.bs, -1, self.x.shape[1] * self.y.shape[-1]]) - ) - - -class TestMatMulOpTransposeReshapeBasicInt( - TestMatMulOpTransposeReshapeBasicFloat -): - def init_data_type(self): - self.data_type_ = np.int8 - - -class TestMatMulOpTransposeReshapeOtherDimFloat( - TestMatMulOpTransposeReshapeBasicFloat -): - def generate_data(self): - self.bs = 11 - self.x = np.random.random([self.bs, 12, 14, 18]).astype(self.data_type_) - self.y = np.random.random([self.bs, 12, 18, 13]).astype(self.data_type_) - - -class TestMatMulOpTransposeReshapeOtherDimInt( - TestMatMulOpTransposeReshapeOtherDimFloat -): - def init_data_type(self): - self.data_type_ = np.int8 - - if __name__ == "__main__": from paddle import enable_static