[MLU]add op: reduce_sum, elementwise_sub (#41697)

* [MLU]add op: reduce_sum, elementwise_sub * [MLU]del unrelated code

[MLU]add op: reduce_sum, elementwise_sub (#41697)
* [MLU]add op: reduce_sum, elementwise_sub * [MLU]del unrelated code
9f06069d · qipengh · GitHub · 0ef3ef28 · 9f06069d · 9f06069d
10 changed file
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
 namespace paddle {
 namespace operators {
@@ -23,35 +22,7 @@ template <typename T>
 class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
+    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    const auto& x_dims = x->dims();
-    const auto& y_dims = y->dims();
-    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
-                     : axis);
-    int max_dim = std::max(x_dims.size(), y_dims.size());
-    std::vector<int> x_dims_array(max_dim);
-    std::vector<int> y_dims_array(max_dim);
-    std::vector<int> out_dims_array(max_dim);
-    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
-                           y_dims_array.data(), out_dims_array.data(), max_dim,
-                           axis);
-    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(),
-                             ToCnnlDataType(x->type()));
-    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(),
-                             ToCnnlDataType(y->type()));
-    MLUCnnlTensorDesc out_desc(*out);
-    MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(),
-                                       CNNL_NOT_PROPAGATE_NAN);
-    MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x),
-                      y_desc.get(), GetBasePtr(y), out_desc.get(),
-                      GetBasePtr(out), ToCnnlDataType<T>());
  }
 };
@@ -75,22 +46,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
-        auto src_dims = dx->dims();
+        GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes,
-        auto dout_dims = dout->dims();
+                                &dst_dims_vec);
-        int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
-        for (int ax = 0; ax < dout_dims.size(); ++ax) {
-          if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
-              (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
-            reduce_axes.push_back(ax);
-          } else {
-            dst_dims_vec.push_back(dout_dims[ax]);
-          }
-        }
-        if (dst_dims_vec.size() == 0) {
-          // x is scalar
-          dst_dims_vec.push_back(1);
-        }
        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
@@ -109,22 +66,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
-        auto src_dims = dy->dims();
+        GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes,
-        auto dout_dims = dout->dims();
+                                &dst_dims_vec);
-        int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
-        for (int ax = 0; ax < dout_dims.size(); ++ax) {
-          if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
-              (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
-            reduce_axes.push_back(ax);
-          } else {
-            dst_dims_vec.push_back(dout_dims[ax]);
-          }
-        }
-        if (dst_dims_vec.size() == 0) {
-          // y is scalar
-          dst_dims_vec.push_back(1);
-        }
        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),

--- a/paddle/fluid/operators/elementwise/elementwise_mlu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#ifdef PADDLE_WITH_MLU
+#include <vector>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+namespace paddle {
+namespace operators {
+inline void GetReduceAxes(const int axis, const framework::DDim& src_ddims,
+                          const framework::DDim& target_ddims,
+                          std::vector<int>* axes) {
+  int64_t src_dim_size = src_ddims.size();
+  int64_t target_dim_size = target_ddims.size();
+  for (int64_t i = 0; i < src_dim_size; ++i) {
+    if (i < axis || i >= target_dim_size + axis) {
+      axes->push_back(i);
+      continue;
+    }
+    if (src_ddims[i] > target_ddims[i - axis]) {
+      axes->push_back(i);
+    }
+  }
+}
+inline void GetReduceAxesAndDstDims(const int axis,
+                                    const framework::DDim& src_ddims,
+                                    const framework::DDim& target_ddims,
+                                    std::vector<int>* reduce_axes,
+                                    std::vector<int>* dst_dims_vec) {
+  int64_t src_dim_size = src_ddims.size();
+  int64_t target_dim_size = target_ddims.size();
+  int src_axis = (target_dim_size < src_dim_size ? axis : 0);
+  for (int ax = 0; ax < src_dim_size; ++ax) {
+    if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
+        (src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
+      reduce_axes->push_back(ax);
+    } else {
+      dst_dims_vec->push_back(src_ddims[ax]);
+    }
+  }
+  if (dst_dims_vec->size() == 0) {
+    // target_var is scalar
+    dst_dims_vec->push_back(1);
+  }
+}
+template <typename T>
+void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
+                       const cnnlOpTensorDesc_t op_tensor_op) {
+  PADDLE_ENFORCE_EQ(
+      platform::is_mlu_place(ctx.GetPlace()), true,
+      platform::errors::Unavailable("This kernel only runs on MLU."));
+  PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
+                        (op_tensor_op == CNNL_OP_TENSOR_SUB) ||
+                        (op_tensor_op == CNNL_OP_TENSOR_MUL),
+                    true,
+                    platform::errors::Unavailable(
+                        "This kernel of MLU only support ADD, SUB, MUL."));
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Output<Tensor>("Out");
+  out->mutable_data<T>(ctx.GetPlace());
+  int axis = ctx.Attr<int>("axis");
+  const auto& x_dims = x->dims();
+  const auto& y_dims = y->dims();
+  axis =
+      (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
+  int max_dim = std::max(x_dims.size(), y_dims.size());
+  std::vector<int> x_dims_array(max_dim);
+  std::vector<int> y_dims_array(max_dim);
+  std::vector<int> out_dims_array(max_dim);
+  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                         y_dims_array.data(), out_dims_array.data(), max_dim,
+                         axis);
+  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
+  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
+  MLUCnnlTensorDesc out_desc(*out);
+  MLUCnnlOpTensorDesc op_tensor_desc(op_tensor_op, ToCnnlDataType<T>(),
+                                     CNNL_NOT_PROPAGATE_NAN);
+  MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x),
+                    y_desc.get(), GetBasePtr(y), out_desc.get(),
+                    GetBasePtr(out), ToCnnlDataType<T>());
+}
+// ------------------ BinaryOp -----------------
+enum BINARY_FUNCTOR {
+  DIV,
+  DIVNONAN,
+};
+template <BINARY_FUNCTOR func>
+void MLUBinary(const framework::ExecutionContext& ctx,
+               cnnlComputationPreference_t prefer,
+               const cnnlTensorDescriptor_t x_desc, const void* x,
+               const cnnlTensorDescriptor_t y_desc, const void* y,
+               const cnnlTensorDescriptor_t out_desc, void* out);
+template <>
+inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
+                           cnnlComputationPreference_t prefer,
+                           const cnnlTensorDescriptor_t x_desc, const void* x,
+                           const cnnlTensorDescriptor_t y_desc, const void* y,
+                           const cnnlTensorDescriptor_t out_desc, void* out) {
+  MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
+}
+template <BINARY_FUNCTOR Functor, typename T>
+void MLUBinaryOp(const framework::ExecutionContext& ctx) {
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Output<Tensor>("Out");
+  out->mutable_data<T>(ctx.GetPlace());
+  int axis = ctx.Attr<int>("axis");
+  const auto& x_dims = x->dims();
+  const auto& y_dims = y->dims();
+  axis =
+      (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
+  int max_dim = std::max(x_dims.size(), y_dims.size());
+  std::vector<int> x_dims_array(max_dim);
+  std::vector<int> y_dims_array(max_dim);
+  std::vector<int> out_dims_array(max_dim);
+  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
+                         y_dims_array.data(), out_dims_array.data(), max_dim,
+                         axis);
+  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
+  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
+  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
+  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
+  MLUBinary<Functor>(ctx, prefer_type, x_desc.get(), GetBasePtr(x),
+                     y_desc.get(), GetBasePtr(y), out_desc.get(),
+                     GetBasePtr(out));
+}
+// ------------------ UnaryOp -----------------
+enum UNARY_FUNCTOR {
+  NEG,
+  RECIPROCAL,
+};
+template <UNARY_FUNCTOR func>
+void MLUUnary(const framework::ExecutionContext& ctx,
+              cnnlComputationPreference_t prefer,
+              const cnnlTensorDescriptor_t input_desc, const void* input,
+              const cnnlTensorDescriptor_t ouput_desc, void* output);
+template <>
+inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
+                          cnnlComputationPreference_t prefer,
+                          const cnnlTensorDescriptor_t input_desc,
+                          const void* input,
+                          const cnnlTensorDescriptor_t output_desc,
+                          void* output) {
+  MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
+}
+template <>
+inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
+                                 cnnlComputationPreference_t prefer,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output) {
+  MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
+}
+template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
+void MLUUnaryOp(const framework::ExecutionContext& ctx) {
+  auto* x = ctx.Input<Tensor>("X");
+  auto* out = ctx.Output<Tensor>("Out");
+  out->mutable_data<Tout>(ctx.GetPlace());
+  MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
+  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());
+  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
+  MLUUnary<Functor>(ctx, prefer_type, x_desc.get(), GetBasePtr(x),
+                    out_desc.get(), GetBasePtr(out));
+}
+}  // namespace operators
+}  // namespace paddle
+#endif
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
 namespace paddle {
 namespace operators {
@@ -21,53 +20,11 @@ namespace operators {
 using Tensor = framework::Tensor;
 using MLUDeviceContext = platform::MLUDeviceContext;
-static void GetReduceAxes(const int axis, const framework::DDim& src_ddims,
-                          const framework::DDim& target_ddims,
-                          std::vector<int>* axes) {
-  int64_t src_dim_size = src_ddims.size();
-  int64_t target_dim_size = target_ddims.size();
-  for (int64_t i = 0; i < src_dim_size; ++i) {
-    if (i < axis || i >= target_dim_size + axis) {
-      axes->push_back(i);
-      continue;
-    }
-    if (src_ddims[i] > target_ddims[i - axis]) {
-      axes->push_back(i);
-    }
-  }
-}
 template <typename T>
 class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
+    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Output<Tensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    const auto& x_dims = x->dims();
-    const auto& y_dims = y->dims();
-    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
-                     : axis);
-    int max_dim = std::max(x_dims.size(), y_dims.size());
-    std::vector<int> x_dims_array(max_dim);
-    std::vector<int> y_dims_array(max_dim);
-    std::vector<int> out_dims_array(max_dim);
-    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
-                           y_dims_array.data(), out_dims_array.data(), max_dim,
-                           axis);
-    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
-    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
-    MLUCnnlTensorDesc out_desc(*out);
-    MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
-                                       CNNL_NOT_PROPAGATE_NAN);
-    MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x),
-                      y_desc.get(), GetBasePtr(y), out_desc.get(),
-                      GetBasePtr(out), ToCnnlDataType<T>());
  }
 };

--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T>
+class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
+  }
+};
+template <typename T>
+class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MLUDeviceContext>();
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
+    MLUCnnlTensorDesc dout_desc(*dout);
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      if (dx->dims() != dout->dims()) {
+        std::vector<int> dst_dims_vec;
+        std::vector<int> reduce_axes;
+        GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes,
+                                &dst_dims_vec);
+        MLUCnnlReduceDesc reduction_desc(
+            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
+            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
+        MLUCnnlTensorDesc dx_desc(dst_dims_vec.size(), dst_dims_vec.data(),
+                                  ToCnnlDataType<T>());
+        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
+                        nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr,
+                        nullptr, dx_desc.get(), GetBasePtr(dx));
+      } else {
+        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
+      }
+    }
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      if (dy->dims() != dout->dims()) {
+        std::vector<int> dst_dims_vec;
+        std::vector<int> reduce_axes;
+        GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes,
+                                &dst_dims_vec);
+        MLUCnnlReduceDesc reduction_desc(
+            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
+            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
+        MLUCnnlTensorDesc dy_desc(dst_dims_vec.size(), dst_dims_vec.data(),
+                                  ToCnnlDataType<T>());
+        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
+                        nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr,
+                        nullptr, dy_desc.get(), GetBasePtr(dy));
+        tmp_dout = dy;
+      }
+      // call neg op, dy = -dout
+      MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
+      MLUCnnlTensorDesc dy_desc(*dy);
+      MLUUnary<NEG>(ctx, CNNL_COMPUTATION_HIGH_PRECISION, tmp_dout_desc.get(),
+                    GetBasePtr(tmp_dout), dy_desc.get(), GetBasePtr(dy));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(elementwise_sub, ops::ElementwiseSubMLUKernel<int>,
+                       ops::ElementwiseSubMLUKernel<float>,
+                       ops::ElementwiseSubMLUKernel<plat::float16>);
+REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
+                       ops::ElementwiseSubGradMLUKernel<int>,
+                       ops::ElementwiseSubGradMLUKernel<float>,
+                       ops::ElementwiseSubGradMLUKernel<plat::float16>);
--- a/paddle/fluid/operators/mlu/mlu_baseop.h
+++ b/paddle/fluid/operators/mlu/mlu_baseop.h
@@ -45,6 +45,22 @@ enum MLULogicMethod {
  CNNL_LOGIC_OP_OR = 7,
 };
+const std::map<std::string, cnnlReduceOp_t> MLUReduceOpMap = {
+    {"reduce_all", CNNL_REDUCE_AND},  {"reduce_any", CNNL_REDUCE_OR},
+    {"reduce_max", CNNL_REDUCE_MAX},  {"reduce_mean", CNNL_REDUCE_AVG},
+    {"reduce_min", CNNL_REDUCE_MIN},  {"reduce_sum", CNNL_REDUCE_ADD},
+    {"reduce_prod", CNNL_REDUCE_MUL},
+};
+inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) {
+  auto iter = MLUReduceOpMap.find(reduce_name);
+  if (iter != MLUReduceOpMap.end()) {
+    return iter->second;
+  }
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "Not support reduce op type of MLU Device: %s", reduce_name));
+}
 inline const void* GetBasePtr(const Tensor* t) { return t->data(); }
 inline void* GetBasePtr(Tensor* t) { return t->data(); }

--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
@@ -12,9 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#include "paddle/fluid/platform/device/mlu/device_context.h"
 namespace paddle {
 namespace operators {
@@ -23,42 +21,7 @@ template <typename T>
 class ReduceMeanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
+    MLUReduceOp<T>(context, "reduce_mean");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto dims = context.Attr<std::vector<int>>("dim");
-    auto input_dims = phi::vectorize(input->dims());
-    const auto& input_dim_size = input->dims().size();
-    std::vector<int> reduce_dims;
-    if (reduce_all) {
-      for (size_t i = 0; i < input_dims.size(); i++) {
-        reduce_dims.push_back(static_cast<int>(i));
-      }
-    } else {
-      for (size_t i = 0; i < dims.size(); ++i) {
-        if (dims[i] < 0) {
-          reduce_dims.push_back(dims[i] + input_dim_size);
-        } else {
-          reduce_dims.push_back(dims[i]);
-        }
-      }
-    }
-    MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
-                                 ToCnnlDataType(input->dtype()));
-    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
-                                  ToCnnlDataType(output->dtype()));
-    MLUCnnlReduceDesc reduction_desc(
-        reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType<T>(),
-        CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
-    MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
-                    nullptr, input_desc.get(), GetBasePtr(input),
-                    0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
-                    GetBasePtr(output));
  }
 };

--- a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#ifdef PADDLE_WITH_MLU
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+void MLUReduceOp(const framework::ExecutionContext& context,
+                 std::string reduce_name) {
+  PADDLE_ENFORCE_EQ(
+      platform::is_mlu_place(context.GetPlace()), true,
+      platform::errors::Unavailable("This kernel only runs on MLU."));
+  auto* input = context.Input<Tensor>("X");
+  auto* output = context.Output<Tensor>("Out");
+  output->mutable_data<T>(context.GetPlace());
+  bool reduce_all = context.Attr<bool>("reduce_all");
+  auto dims = context.Attr<std::vector<int>>("dim");
+  auto input_dims = phi::vectorize(input->dims());
+  const auto& input_dim_size = input->dims().size();
+  std::vector<int> reduce_dims;
+  if (reduce_all) {
+    for (size_t i = 0; i < input_dims.size(); i++) {
+      reduce_dims.push_back(static_cast<int>(i));
+    }
+  } else {
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) {
+        reduce_dims.push_back(dims[i] + input_dim_size);
+      } else {
+        reduce_dims.push_back(dims[i]);
+      }
+    }
+  }
+  MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
+                               ToCnnlDataType(input->dtype()));
+  MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
+                                ToCnnlDataType(output->dtype()));
+  cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
+  MLUCnnlReduceDesc reduction_desc(reduce_dims, reduce_op, ToCnnlDataType<T>(),
+                                   CNNL_NOT_PROPAGATE_NAN,
+                                   CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
+  MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
+                  nullptr, input_desc.get(), GetBasePtr(input),
+                  0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
+                  GetBasePtr(output));
+}
+}  // namespace operators
+}  // namespace paddle
+#endif
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+class ReduceSumMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    MLUReduceOp<T>(context, "reduce_sum");
+  }
+};
+template <typename T>
+class ReduceSumGradMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<Tensor>("X");
+    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    in_grad->mutable_data<T>(context.GetPlace());
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto reduce_dims = context.Attr<std::vector<int>>("dim");
+    auto in_dims = phi::vectorize(in->dims());
+    if (reduce_all) {
+      reduce_dims.clear();
+      for (size_t d = 0; d < in_dims.size(); ++d) {
+        reduce_dims.push_back(static_cast<int>(d));
+      }
+    }
+    for (auto& d : reduce_dims) {
+      if (d < 0) {
+        d = d + in_dims.size();
+      }
+    }
+    Tensor tmp_out(out_grad->dtype());
+    auto tmp_output_dims = in_dims;
+    for (auto d : reduce_dims) {
+      tmp_output_dims[d] = 1;
+    }
+    tmp_out.ShareDataWith(*out_grad);
+    tmp_out.Resize(phi::make_ddim(tmp_output_dims));
+    MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
+    MLUCnnlTensorDesc in_grad_desc(*in_grad, CNNL_LAYOUT_ARRAY,
+                                   ToCnnlDataType<T>());
+    MLUCnnl::BroadcastTo(context, out_desc.get(), GetBasePtr(&tmp_out),
+                         in_grad_desc.get(), GetBasePtr(in_grad));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_MLU_KERNEL(reduce_sum, ops::ReduceSumMLUKernel<float>,
+                       ops::ReduceSumMLUKernel<plat::float16>);
+REGISTER_OP_MLU_KERNEL(reduce_sum_grad, ops::ReduceSumGradMLUKernel<float>,
+                       ops::ReduceSumGradMLUKernel<plat::float16>);
--- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid as fluid
+paddle.enable_static()
+SEED = 2022
+class TestElementwiseSubOp(OpTest):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.init_dtype()
+        self.init_input_output()
+        self.init_axis()
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis}
+        self.outputs = {'Out': self.out}
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.subtract(self.x, self.y)
+    def init_dtype(self):
+        self.dtype = np.float32
+    def init_axis(self):
+        self.axis = 0
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            self.place, ['Y'],
+            'Out',
+            max_relative_error=0.005,
+            no_grad_set=set("X"))
+    def test_check_grad_ingore_y(self):
+        self.check_grad_with_place(
+            self.place, ['X'],
+            'Out',
+            max_relative_error=0.005,
+            no_grad_set=set('Y'))
+@skip_check_grad_ci(
+    reason="[skip shape check] Use y_shape(1) to test broadcast.")
+class TestElementwiseSubOp_scalar(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(10, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+class TestElementwiseSubOp_Vector(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.random((100, )).astype("float32"),
+            'Y': np.random.random((100, )).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+class TestElementwiseSubOp_broadcast_0(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(100, 3, 2).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32)
+        }
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1)
+        }
+class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 100, 3).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32)
+        }
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1)
+        }
+class TestElementwiseSubOp_broadcast_2(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32)
+        }
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100)
+        }
+class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
+            'Y': np.random.rand(10, 12).astype(np.float32)
+        }
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1)
+        }
+class TestElementwiseSubOp_broadcast_4(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 5, 3, 12).astype(np.float32),
+            'Y': np.random.rand(2, 5, 1, 12).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+class TestElementwiseSubOp_commonuse_1(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(1, 1, 100).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+class TestElementwiseSubOp_commonuse_2(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(10, 3, 1, 4).astype(np.float32),
+            'Y': np.random.rand(10, 1, 12, 1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseSubOp):
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(10, 12).astype(np.float32),
+            'Y': np.random.rand(2, 3, 10, 12).astype(np.float32)
+        }
+        self.attrs = {'axis': 2}
+        self.outputs = {
+            'Out': self.inputs['X'].reshape(1, 1, 10, 12) - self.inputs['Y']
+        }
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+paddle.enable_static()
+class TestMLUReduceSumOp(OpTest):
+    def setUp(self):
+        self.init_op_type()
+        self.initTestCase()
+        self.set_mlu()
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keep_dim,
+            'reduce_all': self.reduce_all
+        }
+        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
+        if self.attrs['reduce_all']:
+            self.outputs = {'Out': self.inputs['X'].sum()}
+        else:
+            self.outputs = {
+                'Out': self.inputs['X'].sum(axis=self.axis,
+                                            keepdims=self.attrs['keep_dim'])
+            }
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+    def init_op_type(self):
+        self.op_type = "reduce_sum"
+        self.use_mkldnn = False
+        self.keep_dim = False
+        self.reduce_all = False
+    def initTestCase(self):
+        self.shape = (5, 6, 10)
+        self.axis = (0, )
+class TestSumOp5D(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (1, 2, 5, 6, 10)
+        self.axis = (0, )
+class TestSumOp6D(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (1, 1, 2, 5, 6, 10)
+        self.axis = (0, )
+class TestSumOp8D(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (1, 3, 1, 2, 1, 4, 3, 10)
+        self.axis = (0, 3)
+class Test1DReduce(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = 120
+        self.axis = (0, )
+class Test2DReduce0(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (20, 10)
+        self.axis = (0, )
+class Test2DReduce1(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (20, 10)
+        self.axis = (1, )
+class Test3DReduce0(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 7)
+        self.axis = (1, )
+class Test3DReduce1(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 7)
+        self.axis = (2, )
+class Test3DReduce2(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 7)
+        self.axis = (-2, )
+class Test3DReduce3(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 7)
+        self.axis = (1, 2)
+class TestKeepDimReduce(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 10)
+        self.axis = (1, )
+        self.keep_dim = True
+class TestKeepDim8DReduce(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (2, 5, 3, 2, 2, 3, 4, 2)
+        self.axis = (3, 4, 5)
+        self.keep_dim = True
+class TestReduceAll(TestMLUReduceSumOp):
+    def initTestCase(self):
+        self.shape = (5, 6, 2, 10)
+        self.axis = (0, )
+        self.reduce_all = True
+if __name__ == '__main__':
+    unittest.main()