diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
index ed9f6230720f83100e641068c8664d643b6db260..60f4e4b309c5d817a0d0e8eaf07f505d55837b93 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc
@@ -24,7 +24,7 @@
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_OP(mul);
+USE_OP_ITSELF(mul);
 USE_OP(cinn_launch);
 USE_OP_ITSELF(elementwise_add);
 namespace paddle::framework {
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
index 47dffd47b7cbbf4a37e6715b40d41024330bc679..c11c7124b62777a8a5da12f67594d01f255f6637 100644
--- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc
@@ -674,7 +674,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
 }  // namespace paddle
 
 USE_PASS(build_cinn_pass);
-USE_OP(mul);
+USE_OP_ITSELF(mul);
 USE_OP_ITSELF(relu);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(relu_grad);
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
index cdccc4c5546900a141a084281f419c2940b23817..44f4424d70d4c85e1a4d447feba14a3f1b92c267 100644
--- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
+++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc
@@ -300,6 +300,6 @@ TEST(CinnCompilerTest, Compile) {
 
 USE_PASS(build_cinn_pass);
 USE_PASS(graph_viz_pass);
-USE_OP(mul);
+USE_OP_ITSELF(mul);
 USE_OP_ITSELF(relu);
 USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc
index 7ec21385bb73750aaea501fb3b99c2ea76f52a68..4a0b99518a63f87e67c33d96f06a39e1e904cee1 100644
--- a/paddle/fluid/imperative/tests/test_eager.cc
+++ b/paddle/fluid/imperative/tests/test_eager.cc
@@ -98,4 +98,4 @@ TEST(test_var_helper, eager_var_helper) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(mul);
+USE_OP_ITSELF(mul);
diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc
index 02a1689c23a3fe5e1543a2e52d7661d5997bc062..eb7e327662c3013e0ac6ee8ef12dbe484db2a4ce 100644
--- a/paddle/fluid/imperative/tests/test_hooks.cc
+++ b/paddle/fluid/imperative/tests/test_hooks.cc
@@ -28,6 +28,8 @@
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
 
 namespace platform = paddle::platform;
 namespace framework = paddle::framework;
@@ -267,7 +269,7 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(mul);
-USE_OP(mul_grad);
+USE_OP_ITSELF(mul);
+USE_OP_ITSELF(mul_grad);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(elementwise_add_grad);
diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc
index 3fa87d415db0d9946ce3df8b95dc3641bbe9f9c3..3e5ab9ab9636801b487ed536ae486bf95ff63de1 100644
--- a/paddle/fluid/imperative/tests/test_layer.cc
+++ b/paddle/fluid/imperative/tests/test_layer.cc
@@ -416,4 +416,4 @@ TEST(test_layer, test_eager) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(mul);
+USE_OP_ITSELF(mul);
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index 75876e07fb5c78fb6ec6949489efac9fcf618a69..1c3a04b51abd036325801af484bb1d800152c328 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -34,9 +34,13 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT);
 #endif
 
 namespace imperative = paddle::imperative;
@@ -598,8 +602,8 @@ TEST(test_tracer, eager_tracer) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(mul);
-USE_OP(mul_grad);
+USE_OP_ITSELF(mul);
+USE_OP_ITSELF(mul_grad);
 USE_OP_ITSELF(reduce_sum);
 USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 1ae2668e733aad23241c63b9985e708396d0b1bc..8134d389469cbe7d654fd675a75a8123257339b1 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -43,4 +43,4 @@ TEST(fc_op, test) {
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
-USE_OP(mul);
+USE_OP_ITSELF(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 282f53559aa75b2c7c252450e392e1996f9b1d81..86cb7543d42da65cc9f82cd13b06610fe532c164 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -46,4 +46,4 @@ TEST(MulOpConverter, main) {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(mul);
+USE_OP_ITSELF(mul);
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
index fe9faab7d6449c6c9ca7b3fbda30662e6174eace..0f70b67bbbd68d00f2d8f341e3d797062aa14b64 100644
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace phi {
@@ -46,6 +46,9 @@ using dnnl::memory;
 using dnnl::prop_kind;
 using dnnl::stream;
 
+constexpr int kMULMKLDNNINT8 = 1;
+constexpr int kMULMKLDNNFP32 = 2;
+
 template <typename XT, typename YT, typename OT>
 class MulPrimitiveFactory {
  public:
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index bc57b429127f0a909eda133de30cdd84acc42b32..6738f15ef74c67b336fbebf3f0ff9cd37a0d93da 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/mul_op.h"
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -27,6 +27,9 @@ namespace operators {
 using framework::OpKernelType;
 using framework::Tensor;
 
+constexpr int kMULMKLDNNINT8 = 1;
+constexpr int kMULMKLDNNFP32 = 2;
+
 class MulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -354,16 +357,3 @@ REGISTER_OPERATOR(mul_grad, ops::MulGradOp,
                   ops::MulDoubleGradMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    mul_grad_grad,
-    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MulDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
deleted file mode 100644
index 6e841712b9bffc06ca56afddcb866af8b3f9b0d8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/mul_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
-                        ops::MulKernel<plat::CUDADeviceContext, double>,
-                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
-    ops::MulGradKernel<plat::CUDADeviceContext, double>,
-    ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad_grad,
-    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MulDoubleGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
deleted file mode 100644
index ce91c6dd0edf1fde90a15b1929fa4560a65f555e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mul_op.h
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-constexpr int kMULMKLDNNINT8 = 1;
-constexpr int kMULMKLDNNFP32 = 2;
-
-template <typename DeviceContext, typename T>
-class MulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor* x = context.Input<Tensor>("X");
-    const Tensor* y = context.Input<Tensor>("Y");
-    Tensor* z = context.Output<Tensor>("Out");
-    const Tensor x_matrix =
-        x->dims().size() > 2
-            ? framework::ReshapeToMatrix(
-                  *x, context.template Attr<int>("x_num_col_dims"))
-            : *x;
-    const Tensor y_matrix =
-        y->dims().size() > 2
-            ? framework::ReshapeToMatrix(
-                  *y, context.template Attr<int>("y_num_col_dims"))
-            : *y;
-
-    z->mutable_data<T>(context.GetPlace());
-    auto z_dim = z->dims();
-    if (z_dim.size() != 2) {
-      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-    }
-
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
-
-    blas.MatMul(x_matrix, y_matrix, z);
-    if (z_dim.size() != 2) {
-      z->Resize(z_dim);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MulGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto x_matrix = x->dims().size() > 2
-                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                        : static_cast<const Tensor&>(*x);
-    auto y_matrix = y->dims().size() > 2
-                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                        : static_cast<const Tensor&>(*y);
-    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
-
-    Tensor dout_mat;
-    dout_mat.ShareDataWith(*dout);
-    dout_mat.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0],
-                     phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
-
-    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
-
-    if (dx != nullptr) {
-      dx->set_lod(x->lod());
-    }
-    if (dy != nullptr) {
-      dy->set_lod(y->lod());
-    }
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    if (dx) {
-      dx->mutable_data<T>(ctx.GetPlace());
-      Tensor dx_matrix = dx->dims().size() > 2
-                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
-                             : *dx;
-
-      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
-    }
-    if (dy) {
-      dy->mutable_data<T>(ctx.GetPlace());
-      Tensor dy_matrix = dy->dims().size() > 2
-                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
-                             : *dy;
-      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class MulDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
-    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto x_mat = x->dims().size() > 2
-                     ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                     : static_cast<const Tensor&>(*x);
-    auto y_mat = y->dims().size() > 2
-                     ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                     : static_cast<const Tensor&>(*y);
-
-    const int m = phi::flatten_to_2d(x->dims(), x_num_col_dims)[0];
-    const int n = phi::flatten_to_2d(y->dims(), y_num_col_dims)[1];
-
-    auto* dout = ctx.Input<framework::LoDTensor>("DOut");
-    Tensor dout_mat;
-    dout_mat.ShareDataWith(*dout);
-    dout_mat.Resize({m, n});
-
-    auto* ddx = ctx.Input<framework::LoDTensor>("DDX");
-    auto* ddy = ctx.Input<framework::LoDTensor>("DDY");
-
-    auto* dx = ctx.Output<framework::LoDTensor>("DX");
-    auto* dy = ctx.Output<framework::LoDTensor>("DY");
-    auto* ddout = ctx.Output<framework::LoDTensor>("DDOut");
-
-    Tensor ddout_mat;
-    if (ddout) {
-      ddout->set_lod(dout->lod());
-      // allocate and reshape ddout
-      ddout->mutable_data<T>(ctx.GetPlace());
-      ddout_mat.ShareDataWith(*ddout);
-      ddout_mat.Resize({m, n});
-    }
-
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-    // a flag to specify whether ddout value has been set, if flag
-    // is false, MatMul beta should be 0 to set ddout, if flag is
-    // true, MatMul beta should be 1 to add result to ddout.
-    bool ddout_flag = false;
-    if (ddx) {
-      auto ddx_mat = ddx->dims().size() > 2
-                         ? framework::ReshapeToMatrix(*ddx, x_num_col_dims)
-                         : static_cast<const Tensor&>(*ddx);
-
-      // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
-      if (dy) {
-        dy->set_lod(y->lod());
-        // allocate and reshape dy
-        dy->mutable_data<T>(ctx.GetPlace());
-        Tensor dy_mat = dy->dims().size() > 2
-                            ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
-                            : *dy;
-        blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
-      }
-      // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
-      if (ddout) {
-        blas.MatMul(ddx_mat, false, y_mat, false, static_cast<T>(1.0),
-                    &ddout_mat, static_cast<T>(ddout_flag));
-        ddout_flag = true;
-      }
-    }
-    if (ddy) {
-      auto ddy_mat = ddy->dims().size() > 2
-                         ? framework::ReshapeToMatrix(*ddy, y_num_col_dims)
-                         : static_cast<const Tensor&>(*ddy);
-      // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
-      if (dx) {
-        dx->set_lod(x->lod());
-        // allocate and reshape dx
-        dx->mutable_data<T>(ctx.GetPlace());
-        Tensor dx_mat = dx->dims().size() > 2
-                            ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
-                            : *dx;
-        blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
-      }
-      // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
-      if (ddout) {
-        blas.MatMul(x_mat, false, ddy_mat, false, static_cast<T>(1.0),
-                    &ddout_mat, static_cast<T>(ddout_flag));
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc
index e1fb5f4f6b0f827e753931d2d4970d2f7859f423..2aedfed9f8e497b7e931c9aead4ff3c8da92c19b 100644
--- a/paddle/fluid/operators/mul_op_npu.cc
+++ b/paddle/fluid/operators/mul_op_npu.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc
index 1fdaa2729909af54653ca840576993d521d1de77..6ef41e059c7d99c3327994a9fac6fdaf5290bfa5 100644
--- a/paddle/fluid/operators/mul_op_xpu.cc
+++ b/paddle/fluid/operators/mul_op_xpu.cc
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/mul_op.h"
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
index c68e8115e898b3701b9f568ac501260615b69ad4..aba519ff04849a54bfe1a69a6381f4298822279f 100644
--- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc
@@ -45,3 +45,17 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
                    double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenGradKernel,
+                   float,
+                   double) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenDoubleGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc
index 2bf56c07a5bc7485fd29d6ac347a5311915d8f36..8aa25c0da07d9617d1734647d511e3707e60ebc3 100644
--- a/paddle/phi/kernels/cpu/matmul_kernel.cc
+++ b/paddle/phi/kernels/cpu/matmul_kernel.cc
@@ -28,3 +28,10 @@ PD_REGISTER_KERNEL(matmul,
                    double,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
index ff23ebd05b52833eef9fd23efb1d8537d1013454..9c80d5e151c1c1bb01d27234c9d2ecf12a361d3b 100644
--- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu
@@ -49,3 +49,19 @@ PD_REGISTER_KERNEL(matmul_triple_grad,
                    phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten_double_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenDoubleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu
index 98be79c5f9dab5f1a72d7784dfbe1745d27bd622..20c9a5229aaa66dc7f4663117ef0c102cb41a12d 100644
--- a/paddle/phi/kernels/gpu/matmul_kernel.cu
+++ b/paddle/phi/kernels/gpu/matmul_kernel.cu
@@ -30,3 +30,11 @@ PD_REGISTER_KERNEL(matmul,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
+PD_REGISTER_KERNEL(matmul_with_flatten,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::MatmulWithFlattenKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
index 495b93f2a4ef0f790d53605e4531af7040c6b2ad..25a9db868d35705330dbf70caee18328a457e46b 100644
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h
@@ -1731,4 +1731,163 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T, typename Context>
+void MatmulWithFlattenGradKernel(const Context& dev_ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 const DenseTensor& out_grad,
+                                 int x_num_col_dims,
+                                 int y_num_col_dims,
+                                 DenseTensor* x_grad,
+                                 DenseTensor* y_grad) {
+  auto x_matrix = x.dims().size() > 2
+                      ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
+                      : x;
+  auto y_matrix = y.dims().size() > 2
+                      ? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
+                      : y;
+  auto* dout = &out_grad;
+
+  DenseTensor dout_mat(*dout);
+  dout_mat.Resize({phi::flatten_to_2d(x.dims(), x_num_col_dims)[0],
+                   phi::flatten_to_2d(y.dims(), y_num_col_dims)[1]});
+
+  auto* dx = x_grad;
+  auto* dy = y_grad;
+
+  if (dx != nullptr) {
+    dx->set_lod(x.lod());
+  }
+  if (dy != nullptr) {
+    dy->set_lod(y.lod());
+  }
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  if (dx) {
+    dev_ctx.template Alloc<T>(dx);
+    DenseTensor dx_matrix =
+        dx->dims().size() > 2
+            ? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
+            : *dx;
+
+    // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+    blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
+  }
+  if (dy) {
+    dev_ctx.template Alloc<T>(dy);
+    DenseTensor dy_matrix =
+        dy->dims().size() > 2
+            ? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
+            : *dy;
+    // dy = x' * dout. dy K x N, dout : M x N, x : M x K
+    blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
+  }
+}
+
+template <typename T, typename Context>
+void MatmulWithFlattenDoubleGradKernel(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    const DenseTensor& y,
+    const DenseTensor& out_grad,
+    paddle::optional<const DenseTensor&> x_grad_grad,
+    paddle::optional<const DenseTensor&> y_grad_grad,
+    int x_num_col_dims,
+    int y_num_col_dims,
+    DenseTensor* x_grad,
+    DenseTensor* y_grad,
+    DenseTensor* out_grad_grad) {
+  auto x_mat = x.dims().size() > 2
+                   ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
+                   : x;
+  auto y_mat = y.dims().size() > 2
+                   ? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
+                   : y;
+
+  const int m = phi::flatten_to_2d(x.dims(), x_num_col_dims)[0];
+  const int n = phi::flatten_to_2d(y.dims(), y_num_col_dims)[1];
+
+  auto* dout = &out_grad;
+  DenseTensor dout_mat(*dout);
+  dout_mat.Resize({m, n});
+
+  auto* ddx = x_grad_grad.get_ptr();
+  auto* ddy = y_grad_grad.get_ptr();
+
+  auto* dx = x_grad;
+  auto* dy = y_grad;
+  auto* ddout = out_grad_grad;
+
+  DenseTensor ddout_mat;
+  if (ddout) {
+    ddout->set_lod(dout->lod());
+    // allocate and reshape ddout
+    dev_ctx.template Alloc<T>(ddout);
+    ddout_mat.ShareDataWith(*ddout);
+    ddout_mat.Resize({m, n});
+  }
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  // a flag to specify whether ddout value has been set, if flag
+  // is false, MatMul beta should be 0 to set ddout, if flag is
+  // true, MatMul beta should be 1 to add result to ddout.
+  bool ddout_flag = false;
+  if (ddx) {
+    auto ddx_mat =
+        ddx->dims().size() > 2
+            ? paddle::framework::ReshapeToMatrix(*ddx, x_num_col_dims)
+            : static_cast<const DenseTensor&>(*ddx);
+
+    // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N
+    if (dy) {
+      dy->set_lod(y.lod());
+      // allocate and reshape dy
+      dev_ctx.template Alloc<T>(dy);
+      DenseTensor dy_mat =
+          dy->dims().size() > 2
+              ? paddle::framework::ReshapeToMatrix(*dy, y_num_col_dims)
+              : *dy;
+      blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat);
+    }
+    // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N
+    if (ddout) {
+      blas.MatMul(ddx_mat,
+                  false,
+                  y_mat,
+                  false,
+                  static_cast<T>(1.0),
+                  &ddout_mat,
+                  static_cast<T>(ddout_flag));
+      ddout_flag = true;
+    }
+  }
+  if (ddy) {
+    auto ddy_mat =
+        ddy->dims().size() > 2
+            ? paddle::framework::ReshapeToMatrix(*ddy, y_num_col_dims)
+            : static_cast<const DenseTensor&>(*ddy);
+    // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K
+    if (dx) {
+      dx->set_lod(x.lod());
+      // allocate and reshape dx
+      dev_ctx.template Alloc<T>(dx);
+      DenseTensor dx_mat =
+          dx->dims().size() > 2
+              ? paddle::framework::ReshapeToMatrix(*dx, x_num_col_dims)
+              : *dx;
+      blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat);
+    }
+    // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N
+    if (ddout) {
+      blas.MatMul(x_mat,
+                  false,
+                  ddy_mat,
+                  false,
+                  static_cast<T>(1.0),
+                  &ddout_mat,
+                  static_cast<T>(ddout_flag));
+    }
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h
index f6136de5d8d0c3d04c83b0446abc82d0eeb11376..3201923e1b2c6f3dacf307153bdff97c3c20fa97 100644
--- a/paddle/phi/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h
@@ -506,4 +506,34 @@ void MatmulKernel(const Context& dev_ctx,
   MatMulFunction<Context, T>(dev_ctx, x, y, out, transpose_x, transpose_y);
 }
 
+template <typename T, typename Context>
+void MatmulWithFlattenKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             int x_num_col_dims,
+                             int y_num_col_dims,
+                             DenseTensor* out) {
+  const DenseTensor x_matrix =
+      x.dims().size() > 2
+          ? paddle::framework::ReshapeToMatrix(x, x_num_col_dims)
+          : x;
+  const DenseTensor y_matrix =
+      y.dims().size() > 2
+          ? paddle::framework::ReshapeToMatrix(y, y_num_col_dims)
+          : y;
+
+  dev_ctx.template Alloc<T>(out);
+  auto z_dim = out->dims();
+  if (z_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  blas.MatMul(x_matrix, y_matrix, out);
+  if (z_dim.size() != 2) {
+    out->Resize(z_dim);
+  }
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/matmul_grad_kernel.h b/paddle/phi/kernels/matmul_grad_kernel.h
index 10452ff0b7903cfd017f0ebbd73d42df52579b84..41a835db46f71e46daa92783cecd502f46f72186 100644
--- a/paddle/phi/kernels/matmul_grad_kernel.h
+++ b/paddle/phi/kernels/matmul_grad_kernel.h
@@ -60,4 +60,28 @@ void MatmulTripleGradKernel(const Context& dev_ctx,
                             DenseTensor* out_d_ddx,
                             DenseTensor* out_d_ddy);
 
+template <typename T, typename Context>
+void MatmulWithFlattenGradKernel(const Context& dev_ctx,
+                                 const DenseTensor& x,
+                                 const DenseTensor& y,
+                                 const DenseTensor& out_grad,
+                                 int x_num_col_dims,
+                                 int y_num_col_dims,
+                                 DenseTensor* x_grad,
+                                 DenseTensor* y_grad);
+
+template <typename T, typename Context>
+void MatmulWithFlattenDoubleGradKernel(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    const DenseTensor& y,
+    const DenseTensor& out_grad,
+    paddle::optional<const DenseTensor&> x_grad_grad,
+    paddle::optional<const DenseTensor&> y_grad_grad,
+    int x_num_col_dims,
+    int y_num_col_dims,
+    DenseTensor* x_grad,
+    DenseTensor* y_grad,
+    DenseTensor* out_grad_grad);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/matmul_kernel.h b/paddle/phi/kernels/matmul_kernel.h
index b524b9e5863dcbcacaea11df9a96b71570312213..a4c4971499fdf713bff482e0a28001ae9e3c6957 100644
--- a/paddle/phi/kernels/matmul_kernel.h
+++ b/paddle/phi/kernels/matmul_kernel.h
@@ -29,6 +29,16 @@ void MatmulKernel(const Context& dev_ctx,
                   bool transpose_y,
                   DenseTensor* out);
 
+// In order to be compatible with `mul` op in fluid,
+// it is no longer used in 2.x API
+template <typename T, typename Context>
+void MatmulWithFlattenKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& y,
+                             int x_num_col_dims,
+                             int y_num_col_dims,
+                             DenseTensor* out);
+
 template <typename T, typename Context>
 DenseTensor Matmul(const Context& dev_ctx,
                    const DenseTensor& x,
diff --git a/paddle/phi/ops/compat/mul_sig.cc b/paddle/phi/ops/compat/mul_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8770db1039eb6d38ca36d0cd7d5ac1711eb12f21
--- /dev/null
+++ b/paddle/phi/ops/compat/mul_sig.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MulGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("matmul_with_flatten_grad",
+                         {"X", "Y", GradVarName("Out")},
+                         {"x_num_col_dims", "y_num_col_dims"},
+                         {GradVarName("X"), GradVarName("Y")});
+}
+
+KernelSignature MulDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("matmul_with_flatten_double_grad",
+                         {"X", "Y", "DOut", "DDX", "DDY"},
+                         {"x_num_col_dims", "y_num_col_dims"},
+                         {"DX", "DY", "DDOut"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(mul, matmul_with_flatten);
+PD_REGISTER_BASE_KERNEL_NAME(mul_grad, matmul_with_flatten_grad);
+PD_REGISTER_BASE_KERNEL_NAME(mul_grad_grad, matmul_with_flatten_double_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(mul_grad, phi::MulGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(mul_grad_grad, phi::MulDoubleGradOpArgumentMapping);