PaddlePaddle / Paddle

Commit a6aa701e (unverified)
Author: jjyaoao — Apr 10, 2023
Committer: GitHub — Apr 10, 2023
delete paddle/fluid/operators/math,metrics,optimizers,reduce_ops/*_npu.* (#52674)
Parent: b451aff8
Showing 15 changed files with 0 additions and 2586 deletions (+0 −2586)
paddle/fluid/operators/math/beam_search_npu.cc                    +0 −588
paddle/fluid/operators/metrics/accuracy_op_npu.cc                 +0 −162
paddle/fluid/operators/optimizers/adam_op_npu.cc                  +0 −345
paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc       +0 −194
paddle/fluid/operators/optimizers/momentum_op_npu.cc              +0 −105
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc               +0 −106
paddle/fluid/operators/optimizers/sgd_op_npu.cc                   +0 −66
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc            +0 −53
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc       +0 −80
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc            +0 −216
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc           +0 −129
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc            +0 −123
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc           +0 −102
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc            +0 −171
paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc       +0 −146
paddle/fluid/operators/math/beam_search_npu.cc
deleted, 100644 → 0 (diff collapsed)
paddle/fluid/operators/metrics/accuracy_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class AccuracyNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* inference = ctx.Input<phi::DenseTensor>("Out");
    auto* label = ctx.Input<phi::DenseTensor>("Label");
    auto* indices = ctx.Input<phi::DenseTensor>("Indices");

    auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
    auto* correct = ctx.Output<phi::DenseTensor>("Correct");
    auto* total = ctx.Output<phi::DenseTensor>("Total");
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    int num_samples = inference->dims()[0];
    if (num_samples == 0) {
      return;
    }

    // cast `indices` or `label` if their type is not consistent
    Tensor cast_indices(phi::DataType::INT32);
    Tensor cast_label(phi::DataType::INT32);
    if (indices->dtype() != label->dtype()) {
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
      if (framework::TransToProtoVarType(indices->dtype()) !=
          framework::proto::VarType::INT32) {
        cast_indices.Resize(indices->dims());
        cast_indices.mutable_data<int>(ctx.GetPlace());
        const auto& runner_cast_indices =
            NpuOpRunner("Cast",
                        {*indices},
                        {cast_indices},
                        {{"dst_type", static_cast<int>(dst_dtype)}});
        runner_cast_indices.Run(stream);
      } else {
        cast_indices.ShareDataWith(*indices);
      }
      if (framework::TransToProtoVarType(label->dtype()) !=
          framework::proto::VarType::INT32) {
        cast_label.Resize(label->dims());
        cast_label.mutable_data<int>(ctx.GetPlace());
        const auto& runner_cast_label =
            NpuOpRunner("Cast",
                        {*label},
                        {cast_label},
                        {{"dst_type", static_cast<int>(dst_dtype)}});
        runner_cast_label.Run(stream);
      } else {
        cast_label.ShareDataWith(*label);
      }
    } else {
      cast_indices.ShareDataWith(*indices);
      cast_label.ShareDataWith(*label);
    }

    // equal
    Tensor tmp_equal(phi::DataType::BOOL);
    tmp_equal.Resize(inference->dims());
    tmp_equal.mutable_data<bool>(ctx.GetPlace());
    const auto& runner_equal =
        NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
    runner_equal.Run(stream);

    // cast equal
    Tensor tmp_equal_cast(phi::DataType::FLOAT32);
    tmp_equal_cast.Resize(inference->dims());
    tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
    const auto& runner_cast_equal = NpuOpRunner(
        "Cast",
        {tmp_equal},
        {tmp_equal_cast},
        {{"dst_type",
          static_cast<int>(ConvertToNpuDtype(
              framework::TransToProtoVarType(tmp_equal_cast.dtype())))}});
    runner_cast_equal.Run(stream);

    // [correct]
    // reduce_max
    Tensor tmp_correct_max(phi::DataType::FLOAT32);
    tmp_correct_max.Resize(phi::make_ddim({num_samples}));
    tmp_correct_max.mutable_data<float>(ctx.GetPlace());
    const auto& runner_reduce_max =
        NpuOpRunner("ReduceMaxD",
                    {tmp_equal_cast},
                    {tmp_correct_max},
                    {{"axes", std::vector<int>{1}}, {"keep_dims", false}});
    runner_reduce_max.Run(stream);

    // reduce_sum
    Tensor tmp_correct(phi::DataType::FLOAT32);
    tmp_correct.Resize(correct->dims());
    tmp_correct.mutable_data<float>(ctx.GetPlace());
    const auto& runner_reduce_sum =
        NpuOpRunner("ReduceSumD",
                    {tmp_correct_max},
                    {tmp_correct},
                    {{"axes", std::vector<int>{0}}, {"keep_dims", false}});
    runner_reduce_sum.Run(stream);

    // cast to int
    correct->mutable_data<int>(ctx.GetPlace());
    const auto& runner_cast_correct = NpuOpRunner(
        "Cast",
        {tmp_correct},
        {*correct},
        {{"dst_type",
          static_cast<int>(ConvertToNpuDtype(
              framework::TransToProtoVarType(correct->dtype())))}});
    runner_cast_correct.Run(stream);

    // [total]
    total->mutable_data<int>(ctx.GetPlace());
    FillNpuTensorWithConstant<int>(total, static_cast<int>(num_samples));

    // use `total` of type `float32` for calculating accuracy
    Tensor tmp_total(phi::DataType::FLOAT32);
    tmp_total.Resize(total->dims());
    tmp_total.mutable_data<float>(ctx.GetPlace());
    FillNpuTensorWithConstant<float>(&tmp_total,
                                     static_cast<float>(num_samples));

    // [accuracy]
    accuracy->mutable_data<float>(ctx.GetPlace());
    const auto& runner_accuracy =
        NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
    runner_accuracy.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    accuracy,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext,
                           paddle::platform::float16>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int64_t>);
paddle/fluid/operators/optimizers/adam_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class AdamNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* param_var = ctx.InputVar("Param");
    PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
                      true,
                      platform::errors::InvalidArgument(
                          "The Var(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Param").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* param = ctx.Input<phi::DenseTensor>("Param");
    auto* grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(),
                      true,
                      platform::errors::InvalidArgument(
                          "The Grad(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Grad").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* grad = ctx.Input<phi::DenseTensor>("Grad");
    auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
    auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
    auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");

    auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
    auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");

    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
    auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
    auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
    auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");

    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(
          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
      skip_update = skip_update_vec[0];
    }
    // skip_update=true, just copy input to output, and TensorCopy will call
    // mutable_data
    if (skip_update) {
      VLOG(4) << "Adam skip update";
      framework::TensorCopy(
          *param,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
      framework::TensorCopy(
          *mom1,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom1_out);
      framework::TensorCopy(
          *mom2,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom2_out);
      framework::TensorCopy(
          *beta1_pow,
          beta1_pow->place(),
          ctx.template device_context<platform::DeviceContext>(),
          beta1_pow_out);
      framework::TensorCopy(
          *beta2_pow,
          beta2_pow->place(),
          ctx.template device_context<platform::DeviceContext>(),
          beta2_pow_out);
      return;
    }

    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

    param_out->mutable_data<T>(ctx.GetPlace());
    mom1_out->mutable_data<T>(ctx.GetPlace());
    mom2_out->mutable_data<T>(ctx.GetPlace());

    // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
    // place.
    phi::DenseTensor beta1_pow_tmp;
    phi::DenseTensor beta2_pow_tmp;
    if (beta1_pow->place() == platform::CPUPlace()) {
      T beta1 = *beta1_pow->data<T>();
      beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
      beta1_pow = &beta1_pow_tmp;
    }
    if (beta2_pow->place() == platform::CPUPlace()) {
      T beta2 = *beta2_pow->data<T>();
      beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
      beta2_pow = &beta2_pow_tmp;
    }

    const phi::DenseTensor* beta1_tensor = nullptr;
    const phi::DenseTensor* beta2_tensor = nullptr;
    const phi::DenseTensor* epsilon_tensor = nullptr;

    phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);

    if (ctx.HasInput("Beta1Tensor")) {
      beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
      PADDLE_ENFORCE_EQ(beta1_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(Beta1Tensor) size must be 1, but get %d",
                            beta1_tensor->numel()));
    } else {
      T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
      beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta1_tmp, beta1);
      beta1_tensor = &beta1_tmp;
    }

    if (ctx.HasInput("Beta2Tensor")) {
      beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
      PADDLE_ENFORCE_EQ(beta2_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(Beta2Tensor) size must be 1, but get %d",
                            beta2_tensor->numel()));
    } else {
      T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
      beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta2_tmp, beta2);
      beta2_tensor = &beta2_tmp;
    }

    if (ctx.HasInput("EpsilonTensor")) {
      epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
      PADDLE_ENFORCE_EQ(epsilon_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(EpsilonTensor) size must be 1, but get %d",
                            epsilon_tensor->numel()));
    } else {
      T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
      epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
      epsilon_tensor = &epsilon_tmp;
    }

    VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
            << "beta2_pow.numel() : " << beta2_pow->numel();
    VLOG(3) << "param.numel(): " << param->numel();

    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(),
                      1,
                      platform::errors::InvalidArgument(
                          "beta1 pow output size should be 1, but received "
                          "value is:%d.",
                          beta1_pow_out->numel()));

    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(),
                      1,
                      platform::errors::InvalidArgument(
                          "beta2 pow output size should be 1, but received "
                          "value is:%d.",
                          beta2_pow_out->numel()));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    const auto& runner = NpuOpRunner("ApplyAdamD",
                                     {
                                         *param,
                                         *mom1,
                                         *mom2,
                                         *beta1_pow,
                                         *beta2_pow,
                                         *lr,
                                         *beta1_tensor,
                                         *beta2_tensor,
                                         *epsilon_tensor,
                                         *grad,
                                     },
                                     {
                                         *param_out,
                                         *mom1_out,
                                         *mom2_out,
                                     },
                                     {});
    runner.Run(stream);

    // NOTE(zhiqiu): ApplyAdamD updates params inplace, so
    // if param and param_out is not same, we need to do copy.
    if (param_out->data<T>() != param->data<T>()) {
      framework::TensorCopy(
          *param,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
    }
    if (mom1_out->data<T>() != mom1->data<T>()) {
      framework::TensorCopy(
          *mom1,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom1_out);
    }
    if (mom2_out->data<T>() != mom2->data<T>()) {
      framework::TensorCopy(
          *mom2,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom2_out);
    }
    if (!use_global_beta_pow) {
      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
      const auto& runner_m1 =
          NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
      runner_m1.Run(stream);
      const auto& runner_m2 =
          NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
      runner_m2.Run(stream);
    }
  }
};

template <typename T>
class AdamWNPUKernel : public AdamNPUKernel<platform::NPUDeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    VLOG(3) << "NPU AdamW Kernel";
    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      VLOG(3) << "Has SkipUpdate";
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(
          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
      skip_update = skip_update_vec[0];
    }
    VLOG(3) << "Skip update" << skip_update;
    bool with_decay = ctx.Attr<bool>("with_decay");
    if (!skip_update && with_decay) {
      float coeff = ctx.Attr<float>("coeff");
      auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");

      auto place = ctx.GetPlace();

      auto stream =
          ctx.template device_context<paddle::platform::NPUDeviceContext>()
              .stream();

      phi::DenseTensor one(phi::DataType::FLOAT32);
      phi::DenseTensor decay(phi::DataType::FLOAT32);
      phi::DenseTensor tmp(phi::DataType::FLOAT32);

      tmp.mutable_data<float>({1}, place);
      one.mutable_data<float>({1}, place);
      decay.mutable_data<float>({1}, place);

      FillNpuTensorWithConstant<float>(&one, 1.0f);
      framework::NPUAttributeMap attr_input = {{"value", coeff}};

      const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input);
      runner1.Run(stream);

      const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {});
      runner2.Run(stream);

      if (ctx.HasInput("MasterParam")) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "Master Parma is not supported on npu"));
      } else {
        auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
        param_out->mutable_data<T>(ctx.GetPlace());

        const auto* param_var = ctx.InputVar("Param");
        PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
                          true,
                          platform::errors::InvalidArgument(
                              "The Var(%s)'s type should be phi::DenseTensor, "
                              "but the received is %s",
                              ctx.InputNames("Param").front(),
                              framework::ToTypeName(param_var->Type())));
        auto* param = ctx.Input<phi::DenseTensor>("Param");

        const auto& runner =
            NpuOpRunner("Mul",
                        {*param, decay},
                        {*const_cast<phi::DenseTensor*>(param)},
                        {});
        runner.Run(stream);
      }
    }
    AdamNPUKernel<platform::NPUDeviceContext, T>::Compute(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    adam,
    ops::AdamNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::AdamNPUKernel<paddle::platform::NPUDeviceContext,
                       paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(adamw,
                       ops::AdamWNPUKernel<float>,
                       ops::AdamWNPUKernel<paddle::platform::float16>);
paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto params = ctx.MultiInput<phi::DenseTensor>("Param");
    auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
    size_t n = params.size();
    PADDLE_ENFORCE_EQ(n,
                      params_out.size(),
                      platform::errors::InvalidArgument(
                          "The size of Output(ParamOut) must be equal to "
                          "Input(Param), but got the size of Output(ParamOut) "
                          "is %d, the size of Input(Param) is %d.",
                          params_out.size(),
                          n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(params[i],
                        params_out[i],
                        platform::errors::InvalidArgument(
                            "The size of Input(Param) and Output(ParamOut) "
                            "must be the same Tensors."));
    }

    auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
    PADDLE_ENFORCE_EQ(
        n,
        grads.size(),
        platform::errors::InvalidArgument(
            "The size of Input(Grad) must be equal to Input(Param), but got "
            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
            grads.size(),
            n));

    auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
    PADDLE_ENFORCE_EQ(n,
                      velocitys.size(),
                      platform::errors::InvalidArgument(
                          "The size of Input(Velocity) must be equal to "
                          "Input(Param), but got the size of Input(Velocity) "
                          "is %d, the size of Input(Param) is %d.",
                          velocitys.size(),
                          n));

    auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
    PADDLE_ENFORCE_EQ(
        n,
        velocitys_out.size(),
        platform::errors::InvalidArgument(
            "The size of Output(VelocityOut) must be "
            "equal to Input(Param), but got the size of Output(VelocityOut) is "
            "%d, the size of Input(Param) is %d.",
            velocitys_out.size(),
            n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(velocitys[i],
                        velocitys_out[i],
                        platform::errors::InvalidArgument(
                            "Input(Velocity) and Output(VelocityOut) must be "
                            "the same Tensors."));
    }

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
    if (lrs.size() != 1) {
      PADDLE_ENFORCE_EQ(
          n,
          lrs.size(),
          platform::errors::InvalidArgument(
              "If the size of Input(LearningRate) is not 1, the size of "
              "Input(LearningRate) must be "
              "equal to Input(Param), but got the size of Input(LearningRate) "
              "is %d, the size of Input(Param) is %d.",
              lrs.size(),
              n));
    }
    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
    auto regularization_methods =
        ctx.Attr<std::vector<std::string>>("regularization_method");
    auto regularization_coeffs =
        ctx.Attr<std::vector<float>>("regularization_coeff");
    if (regularization_methods.size() != 0) {
      PADDLE_ENFORCE_EQ(
          n,
          regularization_methods.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_method) must be equal "
              "to Input(Param), but got the size of "
              "Attr(regularization_method) is %d, the size of Input(Param) is "
              "%d.",
              regularization_methods.size(),
              n));
      PADDLE_ENFORCE_EQ(
          n,
          regularization_coeffs.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_coeff) must be equal "
              "to Input(Param), but got the size of Attr(regularization_coeff) "
              "is %d, the size of Input(Param) is %d.",
              regularization_coeffs.size(),
              n));
    }

    VLOG(5) << "use_nesterov: " << use_nesterov
            << ", regularization_methods.size(): "
            << regularization_methods.size()
            << ", regularization_coeffs.size(): "
            << regularization_coeffs.size();

    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    Tensor mu_tensor;
    mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
    FillNpuTensorWithConstant<T>(&mu_tensor, mu);

    for (size_t idx = 0; idx < n; ++idx) {
      phi::RegularizationType regularization_flag =
          regularization_methods.size() > 0 &&
                  regularization_methods[idx] == "l2_decay"
              ? phi::RegularizationType::kL2DECAY
              : phi::RegularizationType::kNONE;
      float regularization_coeff = 0.0;
      if (regularization_coeffs.size() != 0) {
        regularization_coeff = regularization_coeffs[idx];
      }

      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
      auto param = params[idx];
      auto param_out = params_out[idx];
      auto velocity = velocitys[idx];
      auto velocity_out = velocitys_out[idx];

      auto grad = grads[idx];
      Tensor regularized_grad;
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
        const auto& runner1 = NpuOpRunner("Muls",
                                          {*param},
                                          {regularized_grad},
                                          {{"value", regularization_coeff}});
        runner1.Run(dev_ctx.stream());
        const auto& runner2 = NpuOpRunner(
            "Add", {regularized_grad, *grad}, {regularized_grad}, {});
        runner2.Run(dev_ctx.stream());
      } else {
        regularized_grad.ShareDataWith(*grad);
      }
      framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
      framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
      // NOTE: ApplyMomentum will change the input
      const auto& runner = NpuOpRunner("ApplyMomentum",
                                       {*param_out,
                                        *velocity_out,
                                        *learning_rate,
                                        regularized_grad,
                                        mu_tensor},
                                       {*param_out},
                                       {{"use_nesterov", use_nesterov}});
      runner.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(merged_momentum,
                       ops::NPUMergedMomentumOpKernel<float>,
                       ops::NPUMergedMomentumOpKernel<plat::float16>);
paddle/fluid/operators/optimizers/momentum_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    std::string regularization_method =
        ctx.Attr<std::string>("regularization_method");
    auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
    phi::RegularizationType regularization_flag{
        phi::RegularizationType::kNONE};  // disable regularization
    if (regularization_method == "l2_decay") {
      regularization_flag = phi::RegularizationType::kL2DECAY;
    }

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    bool use_nesterov = ctx.Attr<bool>("use_nesterov");

    auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
    auto param = ctx.Input<phi::DenseTensor>("Param");
    auto velocity = ctx.Input<phi::DenseTensor>("Velocity");

    auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");

    param_out->mutable_data<T>(ctx.GetPlace());
    velocity_out->mutable_data<T>(ctx.GetPlace());

    auto* grad_var = ctx.InputVar("Grad");
    if (grad_var->IsType<phi::DenseTensor>()) {
      auto grad = ctx.Input<phi::DenseTensor>("Grad");
      Tensor mu_tensor;
      mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&mu_tensor, mu);

      Tensor regularized_grad;
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
        const auto& runner1 = NpuOpRunner("Muls",
                                          {*param},
                                          {regularized_grad},
                                          {{"value", regularization_coeff}});
        runner1.Run(dev_ctx.stream());
        const auto& runner2 = NpuOpRunner(
            "Add", {regularized_grad, *grad}, {regularized_grad}, {});
        runner2.Run(dev_ctx.stream());
      } else {
        regularized_grad.ShareDataWith(*grad);
      }
      framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
      framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
      // NOTE: ApplyMomentum will change the input
      const auto& runner = NpuOpRunner("ApplyMomentum",
                                       {*param_out,
                                        *velocity_out,
                                        *learning_rate,
                                        regularized_grad,
                                        mu_tensor},
                                       {*param_out},
                                       {{"use_nesterov", use_nesterov}});
      runner.Run(dev_ctx.stream());
    } else if (grad_var->IsType<phi::SelectedRows>()) {
      PADDLE_ENFORCE_EQ(
          false,
          true,
          platform::errors::PermissionDenied("Unsupport SparseMomentum"));
    } else {
      PADDLE_ENFORCE_EQ(false,
                        true,
                        platform::errors::PermissionDenied(
                            "Unsupported Variable Type of Grad "
                            "in MomentumOp. Excepted LodTensor "
                            "or SelectedRows, But received [%s]",
                            paddle::framework::ToTypeName(grad_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(momentum,
                       ops::NPUMomentumOpKernel<float>,
                       ops::NPUMomentumOpKernel<plat::float16>);
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class RMSPROPNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* grad_var = ctx.InputVar("Grad");
    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* moment_out = ctx.Output<phi::DenseTensor>("MomentOut");
    auto* mean_square_out = ctx.Output<phi::DenseTensor>("MeanSquareOut");

    param_out->mutable_data<T>(ctx.GetPlace());
    moment_out->mutable_data<T>(ctx.GetPlace());
    mean_square_out->mutable_data<T>(ctx.GetPlace());

    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
    auto* p_tensor = ctx.Input<phi::DenseTensor>("Param");
    auto* ms_tensor = ctx.Input<phi::DenseTensor>("MeanSquare");
    auto* lr_tensor = ctx.Input<phi::DenseTensor>("LearningRate");
    auto* mom_tensor = ctx.Input<phi::DenseTensor>("Moment");
    bool centered = ctx.Attr<bool>("centered");

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    if (grad_var->IsType<phi::DenseTensor>()) {
      auto* grad_tensor = ctx.Input<phi::DenseTensor>("Grad");
      if (centered) {
        framework::NPUAttributeMap attr_input = {{"use_locking", false}};
        const phi::DenseTensor* rho_tensor = nullptr;
        const phi::DenseTensor* momentum_tensor = nullptr;
        const phi::DenseTensor* epsilon_tensor = nullptr;
        phi::DenseTensor rho_tmp(phi::DataType::FLOAT32);
        rho_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&rho_tmp, rho);
        rho_tensor = &rho_tmp;
        phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32);
        momentum_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&momentum_tmp, momentum);
        momentum_tensor = &momentum_tmp;
        phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
        epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
        epsilon_tensor = &epsilon_tmp;

        auto* mg_tensor = ctx.Input<phi::DenseTensor>("MeanGrad");
        auto* mean_grad_out = ctx.Output<phi::DenseTensor>("MeanGradOut");
        mean_grad_out->mutable_data<T>(ctx.GetPlace());
        const auto& runner_applycenterrmsprop = NpuOpRunner(
            std::string("ApplyCenteredRMSPropD"),
            {*p_tensor,
             *mg_tensor,
             *ms_tensor,
             *mom_tensor,
             *lr_tensor,
             *rho_tensor,
             *momentum_tensor,
             *epsilon_tensor,
             *grad_tensor},
            {*param_out, *mean_grad_out, *mean_square_out, *moment_out},
            {attr_input});
        runner_applycenterrmsprop.Run(stream);
      } else {
        framework::NPUAttributeMap attr_input = {
            {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}};
        const auto& runner_applyrmsprop = NpuOpRunner(
            std::string("ApplyRMSPropD"),
            {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor},
            {*param_out, *mean_square_out, *moment_out},
            {attr_input});
        runner_applyrmsprop.Run(stream);
      }
    } else {
      PADDLE_ENFORCE_EQ(false,
                        true,
                        platform::errors::PermissionDenied(
                            "Unsupported Variable Type of Grad "
                            "in RmspropOp. Excepted LodTensor, "
                            "But received [%s]",
                            paddle::framework::ToTypeName(grad_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    rmsprop,
    ops::RMSPROPNPUKernel<paddle::platform::NPUDeviceContext, float>)
paddle/fluid/operators/optimizers/sgd_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class SGDNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
    auto* param_var = ctx.Input<phi::DenseTensor>("Param");
    auto* grad_var = ctx.Input<phi::DenseTensor>("Grad");
    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");

    param_out->mutable_data<T>(ctx.GetPlace());

    const auto& runner = NpuOpRunner("ApplyGradientDescent",
                                     {*param_var, *learning_rate, *grad_var},
                                     {*param_out},
                                     {});

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);

    // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
    // if param and param_out is not same, we need to do copy.
    if (param_out->data<T>() != param_var->data<T>()) {
      framework::TensorCopy(
          *param_var,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    sgd,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext,
                      paddle::platform::float16>);
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceAnyNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // set attr
    NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}};

    const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel<bool>);
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP_ITSELF(reduce_any);
USE_OP_DEVICE_KERNEL(reduce_any, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  std::vector<bool> init_x = {true, false, false, false};
  f::TensorFromVector<bool>(init_x, ctx, tensor_x);
  tensor_x->Resize(phi::make_ddim({2}));

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  // run
  std::vector<int> axes;
  f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}};
  auto op = f::OpRegistry::CreateOp(
      "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  std::vector<bool> out_vec;
  f::TensorToVector<bool>(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  std::vector<bool> expected_vec = {true};
  EXPECT_EQ(out_vec.size(), expected_vec.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], expected_vec[i]);
  }
}

TEST(reduce_any, NPU) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  Compare<bool>(&scope, *ctx);
}
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMaxNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Input<phi::DenseTensor>("Out");
    auto* out_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    int in_dtype = context.Attr<int>("in_dtype");

    PADDLE_ENFORCE_EQ(
        in_dtype == -1,
        true,
        platform::errors::InvalidArgument(
            "NPU only support in_dtype == -1 in reduce_max_grad op."));

    auto* x_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx =
        context.template device_context<paddle::platform::NPUDeviceContext>();
    auto place = context.GetPlace();
    auto stream = dev_ctx.stream();

    // broadcast
    auto x_dims_vec = phi::vectorize(x->dims());
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }

    phi::DenseTensor tmp_out, tmp_out_grad;
    auto tmp_out_dims_vec = x_dims_vec;
    for (auto d : reduce_dims) {
      if (d < 0) {
        d += x_dims_vec.size();
      }
      tmp_out_dims_vec[d] = 1;
    }

    tmp_out.ShareDataWith(*out);
    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
    tmp_out_grad.ShareDataWith(*out_grad);
    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));

    phi::DenseTensor transformed_out(x->type());
    transformed_out.Resize(phi::make_ddim(x_dims_vec));
    transformed_out.mutable_data<T>(place);
    NpuOpRunner r_brd_out;
    r_brd_out.SetType("BroadcastTo")
        .AddInput(tmp_out)
        .AddInput(std::move(x_dims_vec))
        .AddOutput(transformed_out)
        .Run(stream);
    phi::DenseTensor transformed_out_grad(x->type());
    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
    transformed_out_grad.mutable_data<T>(place);
    NpuOpRunner r_brd_out_grad;
    r_brd_out_grad.SetType("BroadcastTo")
        .AddInput(tmp_out_grad)
        .AddInput(std::move(x_dims_vec))
        .AddOutput(transformed_out_grad)
        .Run(stream);

    // compare
    phi::DenseTensor equal_cond;
    equal_cond.mutable_data<bool>(x_grad->dims(), place);
    const auto& r_equal =
        NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
    r_equal.Run(stream);

    // select
    phi::DenseTensor t_zero;
    t_zero.mutable_data<T>(x_grad->dims(), place);
    FillNpuTensorWithConstant(&t_zero, static_cast<T>(0));
    t_zero.Resize(x_grad->dims());

    const auto& r_sel = NpuOpRunner(
        "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {});
    r_sel.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_max,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int>);
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUReduceMeanOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");

    auto input_dims = input->dims();
    if (reduce_all) {
      dims.clear();
      for (int i = 0; i < input_dims.size(); i++) {
        dims.push_back(static_cast<int>(i));
      }
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    NpuOpRunner runner;
    runner.SetType("ReduceMean")
        .AddInput(*input)
        .AddInput(std::move(dims))
        .AddOutput(*output)
        .AddAttrs({{"keep_dims", keep_dim}})
        .Run(stream);
  }
};

template <typename T>
class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();

    int reduce_numel = 1;
    if (reduce_all) {
      reduce_dims.clear();
      for (int d = 0; d < input_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + input_dims.size();
      }
      reduce_numel *= input_dims[d];
    }

    phi::DenseTensor tensor_value(input_grad->dtype());
    tensor_value.mutable_data<T>({1}, ctx.GetPlace());
    FillNpuTensorWithConstant<T>(
        &tensor_value, static_cast<T>(1.0f / static_cast<T>(reduce_numel)));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    NpuOpRunner runner;
    runner.SetType("Fill")
        .AddInput(phi::vectorize(input_dims))
        .AddInput(tensor_value)
        .AddOutput(*input_grad)
        .Run(stream);

    phi::DenseTensor transformed_input_grad, transformed_out_grad;
    phi::DenseTensor tmp_output_grad;
    auto tmp_output_dims = input_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_output_grad.ShareDataWith(*output_grad);
    tmp_output_grad.Resize(tmp_output_dims);
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    NpuElementWiseOpBroadcast<T>(dev_ctx,
                                 input_grad,
                                 &tmp_output_grad,
                                 0,
                                 &transformed_input_grad,
                                 &transformed_out_grad);
    const auto& runner2 =
        NpuOpRunner("Mul",
                    {transformed_input_grad, transformed_out_grad},
                    {*input_grad},
                    {});
    runner2.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel<float>);
REGISTER_OP_NPU_KERNEL(reduce_mean_grad,
                       ops::NPUReduceMeanGradOpKernel<float>);
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMinNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    if (x->dtype() == phi::DataType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_min,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, plat::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int>);
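All of the reduce kernels in this change treat the reduce_all attribute the same way: it overrides the user-supplied "dim" list with every axis of the input before the "axes" attribute is handed to the ReduceMinD/ReduceProdD/ReduceSumD runner. A standalone sketch of that axis expansion, with an illustrative helper name that is not part of Paddle:

// Standalone sketch, not Paddle code: expand "reduce_all" into the full
// axis list [0, rank) that the NPU reduce runners receive as "axes".
#include <iostream>
#include <vector>

std::vector<int> build_reduce_axes(int rank,
                                   const std::vector<int>& dims,
                                   bool reduce_all) {
  if (!reduce_all) return dims;               // use the supplied axes as-is
  std::vector<int> all(rank);
  for (int i = 0; i < rank; ++i) all[i] = i;  // every axis is reduced
  return all;
}

int main() {
  auto axes = build_reduce_axes(/*rank=*/3, /*dims=*/{1}, /*reduce_all=*/true);
  for (int a : axes) std::cout << a << " ";   // prints 0 1 2
  std::cout << "\n";
  return 0;
}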
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceProdNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());

    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    const auto& runner =
        NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input);
    runner.Run(stream);

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_prod,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, plat::float16>);
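Like reduce_min above, reduce_prod accumulates in the input dtype into the temporary cast_out and issues a single Cast at the end whenever out_dtype requests a different type. A standalone sketch of that "accumulate in T, cast once at the end" pattern (illustrative names, not Paddle API):

// Standalone sketch, not Paddle code: product computed in float, result
// delivered as int64_t via one final cast, mirroring cast_out -> Cast -> out.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t prod_as_int64(const std::vector<float>& x) {
  float acc = 1.0f;                  // accumulate in the input dtype
  for (float v : x) acc *= v;
  return static_cast<int64_t>(acc);  // single cast to the requested out_dtype
}

int main() {
  std::cout << prod_as_int64({2.0f, 3.0f, 4.0f}) << "\n";  // prints 24
  return 0;
}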
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceSumNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // special case
    if (x->dims().size() == 1 && keep_dims == false) {
      keep_dims = true;
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    phi::DenseTensor cast_x;
    phi::DenseTensor cast_out;
    // NOTE: ReduceSumD only supports fp32 and fp16
    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      cast_x.Resize(x->dims());
      cast_x.mutable_data<float>(ctx.GetPlace());
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
      const auto& runner_cast = NpuOpRunner(
          "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);

      cast_out.Resize(out->dims());
      cast_out.mutable_data<float>(ctx.GetPlace());
    } else {
      cast_x.ShareDataWith(*x);
      cast_out.ShareDataWith(*out);
    }

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dim_vec}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    } else {
      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dims}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    }

    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      auto dst_dtype =
          ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype()));
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    x_grad->mutable_data<T>(ctx.GetPlace());

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    if (keep_dims || reduce_all) {
      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {*out_grad},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    } else {
      framework::DDim out_dims;
      out_dims = UnsqueezeKernel<DeviceContext, T>::GetOutputShape(
          dims, out_grad->dims());

      phi::DenseTensor out_grad_tmp(out_grad->type());
      out_grad_tmp.Resize(out_dims);
      out_grad_tmp.mutable_data<T>(ctx.GetPlace());
      framework::TensorCopy(
          *out_grad,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          &out_grad_tmp);
      out_grad_tmp.Resize(out_dims);

      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {out_grad_tmp},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    reduce_sum,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
    reduce_sum_grad,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>);
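In ReduceSumGradNPUKernel, the keep_dim == false branch first restores the reduced axes as size-1 dimensions (via UnsqueezeKernel::GetOutputShape) so that BroadcastToD can map the output gradient back onto the input shape. A standalone sketch of that unsqueeze step, assuming non-negative, sorted-ascending axes (illustrative name, not Paddle API):

// Standalone sketch, not Paddle code: re-insert reduced axes as size-1 dims.
#include <algorithm>
#include <iostream>
#include <vector>

std::vector<int> unsqueeze_reduced(std::vector<int> out_shape,
                                   std::vector<int> axes) {
  std::sort(axes.begin(), axes.end());
  for (int a : axes) {
    out_shape.insert(out_shape.begin() + a, 1);  // restore a size-1 axis
  }
  return out_shape;
}

int main() {
  // x: [2, 3, 4], reduced over axis 1 with keep_dim=false -> out: [2, 4].
  for (int d : unsqueeze_reduced({2, 4}, {1})) std::cout << d << " ";
  std::cout << "\n";  // prints 2 1 4, broadcastable back to [2, 3, 4]
  return 0;
}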
paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class SequenceMaskNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Output<phi::DenseTensor>("Y");
    int maxlen = ctx.Attr<int>("maxlen");

    if (ctx.HasInput("MaxLenTensor")) {
      auto max_len_tensor = ctx.Input<phi::DenseTensor>("MaxLenTensor");
      PADDLE_ENFORCE_NOT_NULL(max_len_tensor,
                              platform::errors::InvalidArgument(
                                  "Input(MaxLenTensor) should not be NULL. "
                                  "But received Input(MaxLenTensor) is NULL"));
      phi::DenseTensor temp;
      paddle::framework::TensorCopySync(
          *max_len_tensor, platform::CPUPlace(), &temp);
      maxlen = *temp.data<int32_t>();
      PADDLE_ENFORCE_GT(
          maxlen,
          0,
          platform::errors::InvalidArgument(
              "Input(MaxLenTensor) value should be greater than 0. But "
              "received Input(MaxLenTensor) value = %d.",
              maxlen));
    }

    if (maxlen < 0) {
      auto x_numel = x->numel();
      if (x_numel == 0) {
        maxlen = 0;
      } else {
        std::vector<T> x_vec;
        framework::TensorToVector(*x, dev_ctx, &x_vec);
        auto x_data = x_vec.data();
        maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
      }
    }

    auto y_dim = phi::vectorize<int>(x->dims());
    y_dim.push_back(maxlen);

    phi::DenseTensor cast_x;
    cast_x.mutable_data<int32_t>(x->dims(), ctx.GetPlace());
    const auto& cast1_runner = NpuOpRunner(
        "Cast",
        {*x},
        {cast_x},
        {{"dst_type",
          ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}});
    cast1_runner.Run(dev_ctx.stream());

    phi::DenseTensor tmp;
    tmp.mutable_data<int32_t>(phi::make_ddim({maxlen}), ctx.GetPlace());
    NpuOpRunner range_runner;
    range_runner.SetType("Range");
    range_runner.AddInput(std::vector<int32_t>({0}));
    range_runner.AddInput(std::vector<int32_t>({maxlen}));
    range_runner.AddInput(std::vector<int32_t>({1}));
    range_runner.AddOutput(tmp);
    range_runner.Run(dev_ctx.stream());

    phi::DenseTensor expand_tmp;
    expand_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& expand_runner =
        NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}});
    expand_runner.Run(dev_ctx.stream());

    auto x_dims = phi::vectorize<int>(x->dims());
    x_dims.push_back(1);
    cast_x.Resize(phi::make_ddim({x_dims}));
    phi::DenseTensor x_tmp;
    x_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& tile_runner =
        NpuOpRunner("TileWithAxis",
                    {cast_x},
                    {x_tmp},
                    {{"axis", x->dims().size()}, {"tiles", maxlen}});
    tile_runner.Run(dev_ctx.stream());

    phi::DenseTensor y_tmp;
    y_tmp.mutable_data<uint8_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& less_runner =
        NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {});
    less_runner.Run(dev_ctx.stream());

    y->Resize(phi::make_ddim(y_dim));
    auto out_dtype = static_cast<framework::proto::VarType::Type>(
        ctx.Attr<int>("out_dtype"));
    if (out_dtype == framework::proto::VarType::INT32) {
      y->mutable_data<int32_t>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::INT64) {
      y->mutable_data<int64_t>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::FP32) {
      y->mutable_data<float>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::FP64) {
      y->mutable_data<double>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::BOOL) {
      y->mutable_data<bool>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::UINT8) {
      y->mutable_data<uint8_t>(ctx.GetPlace());
    } else {
      PADDLE_ENFORCE(false,
                     platform::errors::InvalidArgument(
                         "out_dtype only supports int32, int64, fp32, fp64, "
                         "bool, uint8, but received out_dtype is %d",
                         out_dtype));
    }

    const auto& cast2_runner = NpuOpRunner(
        "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}});
    cast2_runner.Run(dev_ctx.stream());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    sequence_mask,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int32_t>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, float>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, double>);
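The NPU op sequence above (Range, ExpandD, TileWithAxis, Less, Cast) assembles the mask mask[i][j] = (j < len[i]). A standalone CPU sketch of the same result (illustrative name, not Paddle API):

// Standalone CPU sketch, not Paddle code: sequence mask from lengths.
// Row i has len[i] ones followed by zeros, padded out to maxlen columns.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<std::vector<int32_t>> sequence_mask(
    const std::vector<int32_t>& len, int maxlen) {
  std::vector<std::vector<int32_t>> mask(len.size(),
                                         std::vector<int32_t>(maxlen, 0));
  for (std::size_t i = 0; i < len.size(); ++i) {
    for (int j = 0; j < maxlen; ++j) {
      mask[i][j] = (j < len[i]) ? 1 : 0;  // "Less" between iota(j) and len[i]
    }
  }
  return mask;
}

int main() {
  for (const auto& row : sequence_mask({1, 3}, 4)) {
    for (int v : row) std::cout << v << " ";
    std::cout << "\n";  // prints: 1 0 0 0 / 1 1 1 0
  }
  return 0;
}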