diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7505890f41d441cbbf958cda3e86b36343e1b2c
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using MLUDeviceContext = platform::MLUDeviceContext;
+
+// Collects the axes of src_ddims that target_ddims does not match (given the
+// broadcast offset `axis`), i.e. the axes a src-shaped tensor must be
+// sum-reduced over to shrink it back to target's shape.
+static void GetReduceAxes(const int axis, const framework::DDim& src_ddims,
+                          const framework::DDim& target_ddims,
+                          std::vector<int>* axes) {
+  int64_t src_dim_size = src_ddims.size();
+  int64_t target_dim_size = target_ddims.size();
+  for (int64_t i = 0; i < src_dim_size; ++i) {
+    if (i < axis || i >= target_dim_size + axis) {
+      axes->push_back(i);
+      continue;
+    }
+    if (src_ddims[i] > target_ddims[i - axis]) {
+      axes->push_back(i);
+    }
+  }
+}
+
+template <typename T>
+class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    int axis = ctx.Attr<int>("axis");
+    const auto& x_dims = x->dims();
+    const auto& y_dims = y->dims();
+    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
+                     : axis);
+    int max_dim = std::max(x_dims.size(), y_dims.size());
+    std::vector<int> x_dims_array(max_dim);
+    std::vector<int> y_dims_array(max_dim);
+    std::vector<int> out_dims_array(max_dim);
+    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                           y_dims_array.data(), out_dims_array.data(), max_dim,
+                           axis);
+
+    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
+    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
+    MLUCnnlTensorDesc out_desc(*out);
+    MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
+                                       CNNL_NOT_PROPAGATE_NAN);
+
+    MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x),
+                      y_desc.get(), GetBasePtr(y), out_desc.get(),
+                      GetBasePtr(out), ToCnnlDataType<T>());
+  }
+};
+
+template <typename T>
+class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    const auto& x_dims = x->dims();
+    const auto& y_dims = y->dims();
+    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
+                     : axis);
+    int max_dim = std::max(x_dims.size(), y_dims.size());
+    std::vector<int> x_dims_array(max_dim);
+    std::vector<int> y_dims_array(max_dim);
+    std::vector<int> out_dims_array(max_dim);
+    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                           y_dims_array.data(), out_dims_array.data(), max_dim,
+                           axis);
+
+    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
+    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
+    MLUCnnlTensorDesc dout_desc(*dout);
+    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
+                                    CNNL_NOT_PROPAGATE_NAN);
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      if (dx->dims() == dout->dims()) {
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
+                          GetBasePtr(dout), y_desc.get(), GetBasePtr(y),
+                          x_desc.get(), GetBasePtr(dx), ToCnnlDataType<T>());
+      } else {
+        // dx = reduce_sum(dout * y) over the axes where x was broadcast.
+        Tensor dx_temp(x->dtype());
+        dx_temp.Resize(dout->dims());
+        dx_temp.mutable_data<T>(ctx.GetPlace());
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
+                          GetBasePtr(dout), y_desc.get(), GetBasePtr(y),
+                          dout_desc.get(), GetBasePtr(&dx_temp),
+                          ToCnnlDataType<T>());
+
+        std::vector<int> reduce_axes;
+        GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
+        MLUCnnlReduceDesc reduction_desc(
+            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
+            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
+        MLUCnnlTensorDesc dx_desc(*dx);
+        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
+                        nullptr, dout_desc.get(), GetBasePtr(&dx_temp), 0,
+                        nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
+      }
+    }
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      if (dy->dims() == dout->dims()) {
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
+                          GetBasePtr(dout), x_desc.get(), GetBasePtr(x),
+                          y_desc.get(), GetBasePtr(dy), ToCnnlDataType<T>());
+      } else {
+        // dy = reduce_sum(dout * x) over the axes where y was broadcast.
+        Tensor dy_temp(y->dtype());
+        dy_temp.Resize(dout->dims());
+        dy_temp.mutable_data<T>(ctx.GetPlace());
+        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
+                          GetBasePtr(dout), x_desc.get(), GetBasePtr(x),
+                          dout_desc.get(), GetBasePtr(&dy_temp),
+                          ToCnnlDataType<T>());
+
+        std::vector<int> reduce_axes;
+        GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
+        MLUCnnlReduceDesc reduction_desc(
+            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
+            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
+        MLUCnnlTensorDesc dy_desc(*dy);
+        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
+                        nullptr, dout_desc.get(), GetBasePtr(&dy_temp), 0,
+                        nullptr, nullptr, dy_desc.get(), GetBasePtr(dy));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_MLU_KERNEL(elementwise_mul, ops::ElementwiseMulMLUKernel<float>,
+                       ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
+                       ops::ElementwiseMulMLUKernel<int>);
+
+REGISTER_OP_MLU_KERNEL(
+    elementwise_mul_grad, ops::ElementwiseMulGradMLUKernel<float>,
+    ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
+    ops::ElementwiseMulGradMLUKernel<int>);
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc8a08c39ffc8c96f55f3367cf183448cf55d9ba
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, compiler, program_guard
+from paddle.fluid.op import Operator
+
+import sys
+sys.path.append('..')
+from op_test import OpTest, skip_check_grad_ci
+
+paddle.enable_static()
+
+
+class ElementwiseMulOp(OpTest):
+    def init_kernel_type(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.outputs = {'Out': self.out}
+        self.attrs = {'axis': self.axis}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad_with_place(
+            self.place, ['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad_with_place(
+            self.place, ['X'], 'Out', no_grad_set=set('Y'))
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.multiply(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1) to test broadcast.")
+class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(10, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.random((100, )).astype("float32"),
+            'Y': np.random.random((100, )).astype("float32")
+        }
+        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+    def init_input_output(self):
+        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
+        self.y = np.random.rand(100).astype(self.dtype)
+        self.out = self.x * self.y.reshape(100, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 100, 3).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1)
+        }
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100)
+        }
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
+            'Y': np.random.rand(10, 12).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1)
+        }
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(10, 2, 11).astype(np.float32),
+            'Y': np.random.rand(10, 1, 11).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(10, 4, 2, 3).astype(np.float32),
+            'Y': np.random.rand(10, 4, 1, 3).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOpFp16(ElementwiseMulOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(1, 1, 100).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(30, 3, 1, 5).astype(np.float32),
+            'Y': np.random.rand(30, 1, 4, 1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(10, 10).astype(np.float32),
+            'Y': np.random.rand(2, 2, 10, 10).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 2}
+
+        self.outputs = {
+            'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y']
+        }
+        self.init_kernel_type()
+
+
+class TestElementwiseMulOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            # the input of elementwise_mul must be Variable
+            x1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            y1 = fluid.create_lod_tensor(
+                np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
+            self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1)
+
+            # the input dtype of elementwise_mul must be float16, float32 or int32
+            x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8")
+            y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8")
+            self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2)
+
+
+if __name__ == '__main__':
+    unittest.main()
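
For reference, a minimal end-to-end sketch of how the new kernel is exercised from the static-graph Python API. This is an illustrative usage example, not part of the change itself: it assumes an MLU build of Paddle with a visible device 0 (the same `paddle.device.MLUPlace(0)` the tests above use), and the variable names are made up for the example.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
place = paddle.device.MLUPlace(0)  # assumption: MLU build with device 0
exe = fluid.Executor(place)

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    # Static-graph inputs; shapes match the default test case above.
    x = fluid.layers.data(
        name='x', shape=[13, 17], dtype='float32', append_batch_size=False)
    y = fluid.layers.data(
        name='y', shape=[13, 17], dtype='float32', append_batch_size=False)
    # On an MLU place this dispatches to the elementwise_mul kernel
    # registered in elementwise_mul_op_mlu.cc.
    out = fluid.layers.elementwise_mul(x, y)

exe.run(startup)
x_np = np.random.uniform(0.1, 1, [13, 17]).astype('float32')
y_np = np.random.uniform(0.1, 1, [13, 17]).astype('float32')
res, = exe.run(main, feed={'x': x_np, 'y': y_np}, fetch_list=[out])
np.testing.assert_allclose(res, x_np * y_np, rtol=1e-5)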