Unverified commit 599a201f, authored by jjyaoao, committed by GitHub

delete paddle/fluid/operators/elementwise/*_npu.* (#52675)

Parent 0f3bbe10
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
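// When the rank of the smaller input plus `axis` equals the rank of the
// larger one, the inputs are assumed to align on the trailing dimensions and
// the CANN Add op broadcasts them directly; otherwise both inputs are first
// broadcast to a common shape via NpuElementWiseOpBroadcast.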
if (x_dims.size() == y_dims.size()) {
direct_compute = true;
} else if (x_dims.size() > y_dims.size()) {
direct_compute = x_dims.size() == (y_dims.size() + axis);
} else {
direct_compute = y_dims.size() == (x_dims.size() + axis);
}
if (direct_compute) {
const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
runner.Run(dev_ctx.stream());
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
runner.Run(dev_ctx.stream());
}
}
};
template <typename T>
class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = dev_ctx.stream();
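// For addition, dx and dy are just dout; when an input was broadcast, its
// gradient is obtained by summing dout (ReduceSumD) over the dimensions that
// the broadcast introduced or expanded.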
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
auto src_dims = dx->dims();
auto dout_dims = dout->dims();
int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
(dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
} else {
dst_dims_vec.push_back(dout_dims[ax]);
}
}
if (!reduce_axes.empty()) {
phi::DenseTensor tmp;
tmp.ShareDataWith(*dx);
tmp.Resize(phi::make_ddim(dst_dims_vec));
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{tmp},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
auto src_dims = dy->dims();
auto dout_dims = dout->dims();
int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
(dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
} else {
dst_dims_vec.push_back(dout_dims[ax]);
}
}
if (!reduce_axes.empty()) {
phi::DenseTensor tmp;
tmp.ShareDataWith(*dy);
tmp.Resize(phi::make_ddim(dst_dims_vec));
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{tmp},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(elementwise_add,
ops::ElementwiseAddNPUKernel<float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseAddNPUKernel<int64_t>,
#endif
ops::ElementwiseAddNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradNPUKernel<float>,
ops::ElementwiseAddGradNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto place = ctx.GetPlace();
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dx) {
dx->mutable_data<T>(place);
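// The ops below compute dx = dout * (1 / y), with the factor forced to zero
// at positions where x == 0 (via the Equal / LogicalNot / Cast mask).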
phi::DenseTensor tensor_one(y->type());
tensor_one.mutable_data<float>({1}, place);
FillNpuTensorWithConstant<float>(&tensor_one, static_cast<float>(1.0));
// Use the `Div` CANN op to compute `1/y` instead of the `Power` CANN op,
// because `Power` causes precision overflow, i.e. `float_status` gets set
// to 1.
phi::DenseTensor y_div(y->type());
y_div.mutable_data<T>(y->dims(), place);
const auto& runner_one_div_y =
NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {});
runner_one_div_y.Run(stream);
phi::DenseTensor tensor_zeros(x->type());
tensor_zeros.mutable_data<T>(x->dims(), place);
const auto& runner_tensor_zeros =
NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
runner_tensor_zeros.Run(stream);
phi::DenseTensor x_zero(phi::DataType::BOOL);
x_zero.mutable_data<bool>(x->dims(), place);
const auto& runner_x_zero =
NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
runner_x_zero.Run(stream);
phi::DenseTensor x_nozero(phi::DataType::BOOL);
x_nozero.mutable_data<bool>(x->dims(), place);
const auto& runner_x_nonzero =
NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
runner_x_nonzero.Run(stream);
phi::DenseTensor x_nozero_f(x->type());
x_nozero_f.mutable_data<T>(x->dims(), place);
const auto& runner_x_nonzero_f =
NpuOpRunner("Cast",
{x_nozero},
{x_nozero_f},
{{"dst_type", static_cast<int32_t>(0)}});
runner_x_nonzero_f.Run(stream);
phi::DenseTensor x_grad_w(x->type());
x_grad_w.mutable_data<T>(x->dims(), place);
const auto& runner_x_grad_w =
NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {});
runner_x_grad_w.Run(stream);
const auto& runner_x_grad =
NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
runner_x_grad.Run(stream);
}
if (dy) {
dy->mutable_data<T>(place);
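// dy = -out * dout / y = -(x / y^2) * dout. When y was broadcast, the
// product -out * dout is first summed (ReduceSumD) back to y's shape and
// then divided by y.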
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), place);
const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
runner_neg_out.Run(stream);
phi::DenseTensor tmp_mul(out->type());
tmp_mul.mutable_data<T>(out->dims(), place);
const auto& runner_mul =
NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {});
runner_mul.Run(stream);
if (dy->dims() != dout->dims()) {
phi::DenseTensor reduced_tmp_mul(y->type());
reduced_tmp_mul.mutable_data<T>(y->dims(), place);
std::vector<int64_t> axes;
int64_t diff = dout->dims().size() - dy->dims().size();
for (int64_t i = 0; i < dout->dims().size(); ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (dout->dims()[i] > dy->dims()[i - diff]) {
axes.push_back(i);
}
}
const auto& runner_reduce =
NpuOpRunner("ReduceSumD",
{tmp_mul},
{reduced_tmp_mul},
{{"axes", axes}, {"keep_dims", false}});
runner_reduce.Run(stream);
const auto& runner_y_grad =
NpuOpRunner("Div", {reduced_tmp_mul, *y}, {*dy}, {});
runner_y_grad.Run(stream);
} else {
const auto& runner_y_grad =
NpuOpRunner("Div", {tmp_mul, *y}, {*dy}, {});
runner_y_grad.Run(stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_div,
ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(elementwise_floordiv,
ops::ElementwiseFloorDivNPUKernel<int>,
ops::ElementwiseFloorDivNPUKernel<int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Maximum", {transformed_x, transformed_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename DeviceContext, typename T>
class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
// The Ascend elementwise_max_grad op only supports broadcast when axis is
// -1, and requires all inputs to have the same shape when axis is not -1.
// For convenience, first broadcast the original inputs x and y to
// transformed_x and transformed_y, then use tmp tensors to hold the op
// output, and finally reduce the tmp tensors to match the shape of the
// paddle outputs.
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
auto dout_dims = dout->dims();
auto stream = dev_ctx.stream();
framework::NPUAttributeMap attr_input = {{"grad_x", true},
{"grad_y", true}};
// Reshape info vector.
std::vector<int> reduce_axes;
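// MaximumGrad produces gradients at the broadcast (dout) shape; for each
// requested output, reduce_axes lists the dout dimensions introduced or
// expanded by broadcasting, which ReduceSumD then sums away.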
if (dx && dy) {
dx->mutable_data<T>(ctx.GetPlace());
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{tmp_dx, tmp_dy},
attr_input);
runner.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
}
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
}
} else if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{tmp_dx, zero_tensor},
attr_input);
runner.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
}
} else if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{zero_tensor, tmp_dy},
attr_input);
runner.Run(stream);
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
}
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Do not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_max,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
phi::DenseTensor transformed_x, transformed_y;
if (direct_compute) {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
} else {
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
}
const auto& runner =
NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ElementwiseMinGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = dev_ctx.stream();
if (dx && dy) {
// dx
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_x;
tmp_x.ShareDataWith(*dx);
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_x;
std::vector<int> reduce_axes_x;
auto src_dims_x = dx->dims();
auto dout_dims = dout->dims();
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
reduce_axes_x.push_back(ax);
} else {
dst_dims_vec_x.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_x.empty()) {
tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
}
}
// dy
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_y;
tmp_y.ShareDataWith(*dy);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_y;
std::vector<int> reduce_axes_y;
auto src_dims_y = dy->dims();
auto dout_dims = dout->dims();
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
reduce_axes_y.push_back(ax);
} else {
dst_dims_vec_y.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_y.empty()) {
tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{tmp_x, tmp_y},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(y->dims(), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
// dx
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_x;
tmp_x.ShareDataWith(*dx);
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_x;
std::vector<int> reduce_axes_x;
auto src_dims_x = dx->dims();
auto dout_dims = dout->dims();
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
reduce_axes_x.push_back(ax);
} else {
dst_dims_vec_x.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_x.empty()) {
tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{tmp_x, zero_tensor},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(x->dims(), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
// dy
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_y;
tmp_y.ShareDataWith(*dy);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_y;
std::vector<int> reduce_axes_y;
auto src_dims_y = dy->dims();
auto dout_dims = dout->dims();
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
reduce_axes_y.push_back(ax);
} else {
dst_dims_vec_y.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_y.empty()) {
tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{zero_tensor, tmp_y},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else {
PADDLE_THROW(platform::errors::Unavailable(
"At least one output gradient (dx or dy) must be provided."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_min,
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_min_grad,
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseModNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
bool direct_compute = false;
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
phi::DenseTensor transformed_x, transformed_y;
if (direct_compute) {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
} else {
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
}
out->mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("FloorMod", {transformed_x, transformed_y}, {*out}, {});
auto stream = dev_ctx.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_mod,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
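// ReduceDims sums `in`, whose shape is the broadcast shape `brd_ddims`, back
// to the original shape `ddims` by reducing every axis that the broadcast
// added or expanded, and writes the result to `out`.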
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const int axis,
const framework::DDim& ddims,
const framework::DDim& brd_ddims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t brd_size = brd_ddims.size();
int64_t org_size = ddims.size();
// int64_t diff = brd_dims.size() - dims.size();
for (int64_t i = 0; i < brd_size; ++i) {
if (i < axis || i >= org_size + axis) {
axes.push_back(i);
continue;
}
if (brd_ddims[i] > ddims[i - axis]) {
axes.push_back(i);
}
}
// LOG(INFO) << "axes = " << phi::make_ddim(axes).to_str();
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner(
"ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
}
template <typename T>
class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = x_dims.size() == (y_dims.size() + axis);
} else {
direct_compute = y_dims.size() == (x_dims.size() + axis);
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor trans_x, trans_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename T>
class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
phi::DenseTensor trans_x, trans_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
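// dx = dout * y and dy = x * dout; when an input was broadcast, its gradient
// is computed at the broadcast shape and then summed back to the original
// shape with ReduceDims.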
if (dx) {
if (dx->dims() == dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {});
runner_dx.Run(stream);
} else {
phi::DenseTensor dx_temp(x->type());
dx_temp.Resize(trans_x.dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
const auto& runner_dx =
NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {});
runner_dx.Run(stream);
ReduceDims<T>(
ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, dx);
}
}
if (dy) {
if (dy->dims() == dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {});
runner_dy.Run(stream);
} else {
phi::DenseTensor dy_temp(y->type());
dy_temp.Resize(trans_y.dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
const auto& runner_dy =
NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {});
runner_dy.Run(stream);
ReduceDims<T>(
ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(elementwise_mul,
ops::ElementwiseMulNPUKernel<float>,
ops::ElementwiseMulNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseMulNPUKernel<int64_t>,
#endif
ops::ElementwiseMulNPUKernel<int>);
REGISTER_OP_NPU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradNPUKernel<float>,
ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseMulGradNPUKernel<int64_t>,
#endif
ops::ElementwiseMulGradNPUKernel<int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
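// NpuBroadcast expands `src` to `dst_dims` in three stages, e.g. for
// src = [3, 1], dst_dims = [2, 3, 4, 5], axis = 1:
// 1. tile the size-1 axes of src that must grow:  [3, 1] -> [3, 4]
// 2. expand the leading axes before `axis`:       [3, 4] -> [2, 3, 4]
// 3. tile the trailing axes after src's last dim: [2, 3, 4] -> [2, 3, 4, 5]
// The broadcast result is copied into `transformed_src`.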
template <typename T>
void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx,
const phi::DenseTensor* src,
int axis,
const framework::DDim& dst_dims,
phi::DenseTensor* transformed_src) {
auto stream = dev_ctx.stream();
// 1. expand the axes whose dim is 1
auto src_dims = src->dims();
phi::DenseTensor tmp_src;
tmp_src.ShareDataWith(*src);
tmp_src.Resize(src_dims);
for (int i = 0; i < src_dims.size(); ++i) {
if (src_dims[i] == 1 && dst_dims[i + axis] > 1) {
phi::DenseTensor tmp_tensor;
auto tmp_tensor_dims = tmp_src.dims();
tmp_tensor_dims[i] = dst_dims[i + axis];
tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("TileWithAxis",
{tmp_src},
{tmp_tensor},
{{"axis", static_cast<int64_t>(i)},
{"tiles", static_cast<int64_t>(dst_dims[i + axis])}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
tmp_src.Resize(tmp_tensor_dims);
}
}
// 2. expand the leading axes (before `axis`)
auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis));
if (prev > 1) {
phi::DenseTensor tmp_tensor;
auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size());
tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("ExpandD",
{tmp_src},
{tmp_tensor},
{{"shape", phi::vectorize<int64_t>(tmp_tensor_dims)}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
tmp_src.Resize(tmp_tensor_dims);
} else {
tmp_src.Resize(phi::slice_ddim(dst_dims, 0, axis + src_dims.size()));
}
// 3. expand the trailing axes (after the last axis of src)
auto post = phi::product(
phi::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size()));
if (post > 1) {
auto src_dims_vec = phi::vectorize<int>(tmp_src.dims());
src_dims_vec.push_back(1);
tmp_src.Resize(phi::make_ddim(src_dims_vec));
phi::DenseTensor tmp_tensor;
tmp_tensor.mutable_data<T>(dst_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("TileWithAxis",
{tmp_src},
{tmp_tensor},
{{"axis", static_cast<int64_t>(axis + src_dims.size())},
{"tiles", static_cast<int64_t>(post)}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
}
tmp_src.Resize(dst_dims);
framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src);
}
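// NpuElementWiseOpBroadcast computes the common broadcast shape of x and y
// under `axis` and broadcasts both inputs to it, e.g. x = [2, 3, 4, 5],
// y = [3, 4], axis = 1 gives transformed_x = transformed_y = [2, 3, 4, 5].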
template <typename T>
void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx,
const phi::DenseTensor* x,
const phi::DenseTensor* y,
int axis,
phi::DenseTensor* transformed_x,
phi::DenseTensor* transformed_y) {
auto x_dims = x->dims();
auto y_dims = y->dims();
bool is_xsize_larger = true;
int max_dim = x_dims.size();
std::vector<int> dst_dims_vec = phi::vectorize<int>(x_dims);
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
dst_dims_vec = phi::vectorize<int>(y_dims);
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
int x_axis = is_xsize_larger ? 0 : axis;
int y_axis = is_xsize_larger ? axis : 0;
PADDLE_ENFORCE_GE(
axis,
0,
platform::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LE(
axis,
max_dim,
platform::errors::InvalidArgument(
"Axis should be less than or equal to %d, but received axis is %d.",
max_dim,
axis));
for (int i = 0; i < x_dims.size(); ++i) {
dst_dims_vec[i + x_axis] =
std::max(dst_dims_vec[i + x_axis], static_cast<int>(x_dims[i]));
}
for (int i = 0; i < y_dims.size(); ++i) {
dst_dims_vec[i + y_axis] =
std::max(dst_dims_vec[i + y_axis], static_cast<int>(y_dims[i]));
}
auto dst_dims = phi::make_ddim(dst_dims_vec);
NpuBroadcast<T>(dev_ctx, x, x_axis, dst_dims, transformed_x);
NpuBroadcast<T>(dev_ctx, y, y_axis, dst_dims, transformed_y);
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP_ITSELF(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
template <typename T>
void Compare(f::Scope *scope,
const p::DeviceContext &ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<phi::DenseTensor>();
std::vector<T> init_x;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_x.push_back(static_cast<T>(1.0));
}
std::vector<T> init_y;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_y.push_back(static_cast<T>(2.0));
}
paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10});
paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
float expected = 0.0;
if (op_type == "elementwise_add") {
expected = 3.0;
} else if (op_type == "elementwise_sub") {
expected = -1.0;
}
EXPECT_EQ(out_vec.size(), init_x.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], static_cast<T>(expected));
}
}
template <typename T>
void CompareGrad(f::Scope *scope,
const p::DeviceContext &ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
tensor_dout->Resize({2, 3, 5});
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
tensor_x->Resize({2, 3, 5});
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<phi::DenseTensor>();
tensor_y->Resize({1, 5});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<phi::DenseTensor>();
auto dy = scope->Var("DY");
auto tensor_dy = dy->GetMutable<phi::DenseTensor>();
std::vector<T> init_dout;
for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
init_dout.push_back(static_cast<T>(1.0));
}
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
op_type,
{{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}},
attrs);
auto place = ctx.GetPlace();
op->Run(*scope, place);
std::vector<T> dx_vec;
paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
std::vector<T> dy_vec;
paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec);
ctx.Wait();
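// With dout filled with 1.0 and y broadcast from [1, 5] to [2, 3, 5], dx
// matches dout elementwise while dy sums dout over the 2 * 3 broadcast
// positions, so |dy| == 6 (negated for elementwise_sub_grad).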
float expected_x = 0, expected_y = 0;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
}
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub_grad, NPU) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
}
TEST(elementwise_add_grad, NPU) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwisePowNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
int axis = ctx.Attr<int>("axis");
out->mutable_data<T>(place);
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
auto stream = dev_ctx.stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Pow", {transformed_x, transformed_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename DeviceContext, typename T>
class ElementwisePowGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
auto dout_dims = dout->dims();
auto stream = dev_ctx.stream();
// Reshape info vector.
std::vector<int> reduce_axes;
if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, place);
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dx->mutable_data<T>(place);
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, place);
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor PowGrad_dx_temp1(dout->type());
PowGrad_dx_temp1.mutable_data<T>(dout->dims(), place);
const auto& runner_PowGrad_dx_temp1 =
NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {});
runner_PowGrad_dx_temp1.Run(stream);
phi::DenseTensor one_dx(transformed_y.type());
one_dx.mutable_data<T>(transformed_y.dims(), place);
const auto& runner_one_dx =
NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {});
runner_one_dx.Run(stream);
phi::DenseTensor sub_dx(transformed_y.type());
sub_dx.mutable_data<T>(transformed_y.dims(), place);
const auto& runner_sub_dx =
NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {});
runner_sub_dx.Run(stream);
phi::DenseTensor PowGrad_dx_temp2(transformed_x.type());
PowGrad_dx_temp2.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dx_temp2 =
NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {});
runner_PowGrad_dx_temp2.Run(stream);
const auto& runner_dx = NpuOpRunner(
"Mul", {PowGrad_dx_temp1, PowGrad_dx_temp2}, {tmp_dx}, {});
runner_dx.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, place, dev_ctx, dx);
}
}
if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, place);
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dy->mutable_data<T>(place);
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, place);
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor PowGrad_dy_temp1(transformed_x.type());
PowGrad_dy_temp1.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dy_temp1 = NpuOpRunner(
"Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {});
runner_PowGrad_dy_temp1.Run(stream);
phi::DenseTensor one_dy(transformed_x.type());
one_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_one_dy =
NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {});
runner_one_dy.Run(stream);
phi::DenseTensor sub_dy(transformed_x.type());
sub_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_sub_dy =
NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {});
runner_sub_dy.Run(stream);
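// log(x) is obtained as Log1p(x - 1), using sub_dy = x - 1 computed above.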
phi::DenseTensor log_dy(transformed_x.type());
log_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {});
runner_log_dy.Run(stream);
phi::DenseTensor PowGrad_dy_temp2(transformed_x.type());
PowGrad_dy_temp2.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dy_temp2 = NpuOpRunner(
"Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {});
runner_PowGrad_dy_temp2.Run(stream);
const auto& runner_dy =
NpuOpRunner("Mul", {*dout, PowGrad_dy_temp2}, {tmp_dy}, {});
runner_dy.Run(stream);
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, place, dev_ctx, dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_pow,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
elementwise_pow_grad,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// NOTE(zhiqiu): The Ascend Sub op appears to follow broadcast semantics
// with a default axis of -1, so sub_grad should do the reduce if needed.
// For example, the shape of each variable in elementwise_sub:
// x, dx: [2, 3, 5]
// y, dy: [1, 5]
// out, dout: [2, 3, 5]
// Then, out = x - y => dx = dout, dy = -dout
// And, the shape of dy can be computed by two stages reduce,
// 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
// 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
// For dx
// stage 1
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
phi::DenseTensor reduced_dout(dx->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
const auto& runner = NpuOpRunner("ReduceSumD",
{*tmp_dout},
{*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopy(
*tmp_dout,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
// For dy
// stage 1
auto reduce_ndim = dout->dims().size() - dy->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
phi::DenseTensor reduced_dy(dy->type());
phi::DenseTensor reduced_dout(dy->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
phi::DenseTensor* tmp_dy = tmp_dout;
for (auto i = 0; i < dy->dims().size(); ++i) {
if (dy->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("ReduceSumD",
{*tmp_dout},
{reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
tmp_dy = &reduced_dy;
}
// stage 3, negative
const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(elementwise_sub,
ops::ElementwiseSubNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseSubNPUKernel<int64_t>,
#endif
ops::ElementwiseSubNPUKernel<float>,
ops::ElementwiseSubNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseSubGradNPUKernel<int64_t>,
#endif
ops::ElementwiseSubGradNPUKernel<float>,
ops::ElementwiseSubGradNPUKernel<plat::float16>);