Unverified commit a7707efb, authored by jjyaoao, committed by GitHub

delete paddle/fluid/operators/*_npu.* (#52678)

* delete paddle/fluid/operators/*_npu.*

* try pass CI

* try pass CI
Parent 2b0fffc2
@@ -96,7 +96,7 @@ register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 genera
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
target_link_libraries(run_program_op cuda_graph_with_memory_pool)
op_library(quantize_linear_op DEPS phi)
op_library(save_combine_op DEPS string_array phi)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AbsNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Abs",
{
*x,
},
{*out},
{});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class AbsGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
abs,
ops::AbsNPUKernel<plat::NPUDeviceContext, float>,
ops::AbsNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
abs_grad,
ops::AbsGradNPUKernel<plat::NPUDeviceContext, float>,
ops::AbsGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
static void TranposeNPU(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
std::vector<int64_t>* perm,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<T>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Transpose")
.AddInput(in)
.AddInput(std::move(*perm))
.AddOutput(*out)
.Run(stream);
}
static void CastToInt64(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<int64_t>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Cast")
.AddInput(in)
.AddOutput(*out)
.AddAttr("dst_type", ACL_INT64)
.Run(stream);
}
static void CastToFP32(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
out->mutable_data<float>(ctx.GetPlace());
NpuOpRunner runner;
runner.SetType("Cast")
.AddInput(in)
.AddOutput(*out)
.AddAttr("dst_type", ACL_FLOAT)
.Run(stream);
}
template <typename T>
class ArgsortNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
int axis = ctx.Attr<int>("axis");
bool descending = ctx.Attr<bool>("descending");
auto in_dims = input->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
framework::NPUAttributeMap attr = {{"axis", -1},
{"descending", descending}};
phi::DenseTensor indices_tmp(phi::DataType::INT32);
indices_tmp.Resize(indices->dims());
if (framework::TransToProtoVarType(input->dtype()) ==
framework::proto::VarType::INT64) {
phi::DenseTensor input_fp32(phi::DataType::FLOAT32);
input_fp32.Resize(input->dims());
CastToFP32(ctx, stream, *input, &input_fp32);
phi::DenseTensor output_fp32(phi::DataType::FLOAT32);
output_fp32.Resize(output->dims());
if (axis == -1 || axis + 1 == in_dims.size()) {
output_fp32.mutable_data<float>(ctx.GetPlace());
indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr);
runner.Run(stream);
CastToInt64(ctx, stream, output_fp32, output);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_input(input_fp32.type());
trans_input.Resize(trans_dims);
TranposeNPU<float>(ctx, stream, &perm, input_fp32, &trans_input);
phi::DenseTensor trans_output(input_fp32.type());
phi::DenseTensor trans_indices(phi::DataType::INT32);
trans_output.mutable_data<float>(trans_dims, ctx.GetPlace());
trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner(
"Sort", {trans_input}, {trans_output, trans_indices}, attr);
runner.Run(stream);
TranposeNPU<float>(ctx, stream, &perm, trans_output, &output_fp32);
TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
CastToInt64(ctx, stream, output_fp32, output);
}
} else {
if (axis == -1 || axis + 1 == in_dims.size()) {
output->mutable_data<T>(ctx.GetPlace());
indices_tmp.mutable_data<int32_t>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr);
runner.Run(stream);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_input(input->type());
trans_input.Resize(trans_dims);
TranposeNPU<T>(ctx, stream, &perm, *input, &trans_input);
phi::DenseTensor trans_output(input->type());
phi::DenseTensor trans_indices(phi::DataType::INT32);
trans_output.mutable_data<T>(trans_dims, ctx.GetPlace());
trans_indices.mutable_data<int32_t>(trans_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner(
"Sort", {trans_input}, {trans_output, trans_indices}, attr);
runner.Run(stream);
TranposeNPU<T>(ctx, stream, &perm, trans_output, output);
TranposeNPU<int32_t>(ctx, stream, &perm, trans_indices, &indices_tmp);
}
}
CastToInt64(ctx, stream, indices_tmp, indices);
}
};
template <typename T, typename Type>
static void FullAssignNPU(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const framework::DDim in_dims,
const phi::DenseTensor& input,
const phi::DenseTensor& indices,
phi::DenseTensor* t_out) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
phi::DenseTensor input_tmp;
input_tmp.ShareDataWith(input);
input_tmp.Resize(
phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
phi::DenseTensor indices_tmp;
indices_tmp.ShareDataWith(indices);
indices_tmp.Resize(
phi::make_ddim(std::vector<int64_t>{input_height, input_width}));
std::vector<int64_t> indexs_value;
for (Type i = 0; i < input_height; i++) {
indexs_value.push_back(i * input_width);
}
phi::DenseTensor indexs_tmp(indices.type());
framework::TensorFromVector<int64_t>(
indexs_value, ctx.device_context(), &indexs_tmp);
indexs_tmp.Resize(phi::make_ddim(std::vector<int64_t>{input_height, 1}));
phi::DenseTensor indices_index(indices.type());
indices_index.mutable_data<int64_t>(indices_tmp.dims(), ctx.GetPlace());
const auto& runner_add =
NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {});
runner_add.Run(stream);
indices_index.Resize(
phi::make_ddim(std::vector<int64_t>{input_height * input_width}));
t_out->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor out_tmp(t_out->type());
out_tmp.ShareDataWith(*t_out);
const auto& runner = NpuOpRunner("TensorScatterUpdate",
{input_tmp, indices_index, input_tmp},
{out_tmp},
{});
runner.Run(stream);
}
template <typename T>
class ArgsortGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dO = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
auto in_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (dO->numel() == 0) return;
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (axis == -1 || axis + 1 == in_dims.size()) {
FullAssignNPU<T, int64_t>(ctx, stream, in_dims, *dO, *indices, dX);
} else {
std::vector<int64_t> perm;
for (int64_t i = 0; i < in_dims.size(); i++) {
perm.emplace_back(i);
}
std::swap(perm[axis], perm[in_dims.size() - 1]);
std::vector<int64_t> shape;
for (size_t i = 0; i < perm.size(); i++) {
shape.emplace_back(in_dims[perm[i]]);
}
auto trans_dims = phi::make_ddim(shape);
phi::DenseTensor trans_dout(dO->type());
phi::DenseTensor trans_ids(indices->type());
trans_dout.Resize(trans_dims);
trans_ids.Resize(trans_dims);
TranposeNPU<T>(ctx, stream, &perm, *dO, &trans_dout);
TranposeNPU<int64_t>(ctx, stream, &perm, *indices, &trans_ids);
phi::DenseTensor trans_dx(dO->type());
trans_dx.Resize(trans_dims);
FullAssignNPU<T, int64_t>(
ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx);
TranposeNPU<T>(ctx, stream, &perm, trans_dx, dX);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(argsort,
ops::ArgsortNPUKernel<float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ArgsortNPUKernel<int64_t>,
#endif
ops::ArgsortNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(argsort_grad,
ops::ArgsortGradNPUKernel<float>,
ops::ArgsortGradNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
class OpDesc;
class Variable;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AssignNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
assign,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::AssignNPUKernel<paddle::platform::NPUDeviceContext, double>)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP_ITSELF(assign);
USE_OP_DEVICE_KERNEL(assign, NPU);
template <typename T>
void Compare(f::Scope* scope,
const p::DeviceContext& ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
std::vector<T> init;
init.push_back(static_cast<T>(1.0));
init.push_back(static_cast<T>(2.0));
init.push_back(static_cast<T>(3.0));
init.push_back(static_cast<T>(4.0));
paddle::framework::TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({4});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
auto op =
f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
op->Run(*scope, place);
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
EXPECT_EQ(out_vec[0], static_cast<T>(1.0));
EXPECT_EQ(out_vec[1], static_cast<T>(2.0));
EXPECT_EQ(out_vec[2], static_cast<T>(3.0));
EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
}
TEST(assign, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "assign");
}
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(assign_value,
ops::AssignValueKernel<bool>,
ops::AssignValueKernel<int>,
ops::AssignValueKernel<int64_t>,
ops::AssignValueKernel<float>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class NPUBatchNormOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool training = !test_mode && !use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_EQ(
(x_dims.size() == 4UL || x_dims.size() == 3UL),
true,
platform::errors::InvalidArgument(
"The input tensor X's dimension must equal to 3 or 4. "
" But got X's shape = [%s], X's dimension = [%d].",
x_dims.to_str(),
x_dims.size()));
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
auto *y = ctx.Output<phi::DenseTensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto x_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(x->dims(), dev_ctx);
auto y_tesnor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(y->dims(), dev_ctx);
x_tensor.ShareDataWith(*x);
y_tesnor.ShareDataWith(*y);
if (data_layout == DataLayout::kNHWC) {
x_tensor.set_layout(DataLayout::kNHWC);
y_tesnor.set_layout(DataLayout::kNHWC);
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (!training) {
const auto &runner_infer =
NpuOpRunner("BNInfer",
{x_tensor, *scale, *bias, *running_mean, *running_var},
{y_tesnor},
{{"epsilon", epsilon}});
runner_infer.Run(stream);
} else {
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
mean_out->mutable_data<float>(ctx.GetPlace());
variance_out->mutable_data<float>(ctx.GetPlace());
saved_mean->mutable_data<float>(ctx.GetPlace());
saved_variance->mutable_data<float>(ctx.GetPlace());
// if MomentumTensor is set, use MomentumTensor value, momentum
// is only used in this training branch
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
paddle::framework::TensorCopySync(
*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
phi::DenseTensor sum, square_sum;
sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());
square_sum.mutable_data<float>(running_mean->dims(), ctx.GetPlace());
// BNTrainingReduce ONLY support rank = 4
if (x->dims().size() == 3) {
auto x_shape_vec = phi::vectorize(x->dims());
if (data_layout == DataLayout::kNCHW) {
x_shape_vec.push_back(1); // expand NCL -> NCL1
} else {
x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C
}
auto x_new_shape = phi::make_ddim(x_shape_vec);
x_tensor.Resize(x_new_shape);
}
const auto &runner_reduce = NpuOpRunner("BNTrainingReduce",
{x_tensor},
{sum, square_sum},
{{"epsilon", epsilon}});
runner_reduce.Run(stream);
const auto &runner_update = NpuOpRunner(
"BNTrainingUpdate",
{x_tensor,
sum,
square_sum,
*scale,
*bias,
*running_mean,
*running_var},
{y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance},
{{"factor", momentum}, {"epsilon", epsilon}});
runner_update.Run(stream);
}
}
};
template <typename T>
class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
// SavedVariance have been reverted in forward operator
const auto *saved_inv_variance =
ctx.Input<phi::DenseTensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
use_global_stats = is_test || use_global_stats;
auto &dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto x_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(x->dims(), dev_ctx);
auto dy_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(d_y->dims(), dev_ctx);
x_tensor.ShareDataWith(*x);
dy_tensor.ShareDataWith(*d_y);
if (data_layout == DataLayout::kNHWC) {
x_tensor.set_layout(DataLayout::kNHWC);
dy_tensor.set_layout(DataLayout::kNHWC);
}
auto scale_grad_tmp =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(bias->dims(), dev_ctx);
if (d_scale == nullptr) {
d_scale = &scale_grad_tmp;
}
if (d_bias == nullptr) {
d_bias = &bias_grad_tmp;
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (d_scale && d_bias) {
d_scale->mutable_data<float>(ctx.GetPlace());
d_bias->mutable_data<float>(ctx.GetPlace());
if (use_global_stats) {
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_variance = ctx.Input<phi::DenseTensor>("Variance");
const auto &runner_update =
NpuOpRunner("BNTrainingUpdateGrad",
{dy_tensor, x_tensor, *running_mean, *running_variance},
{*d_scale, *d_bias},
{{"epsilon", epsilon}});
runner_update.Run(stream);
} else {
const auto &runner_update =
NpuOpRunner("BNTrainingUpdateGrad",
{dy_tensor, x_tensor, *saved_mean, *saved_inv_variance},
{*d_scale, *d_bias},
{{"epsilon", epsilon}});
runner_update.Run(stream);
}
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto dx_tensor =
ctx.AllocateTmpTensor<T, NPUDeviceContext>(d_x->dims(), dev_ctx);
dx_tensor.ShareDataWith(*d_x);
if (data_layout == DataLayout::kNHWC) {
dx_tensor.set_layout(DataLayout::kNHWC);
}
if (use_global_stats) {
if (x->dims().size() == 3) {
// BNInferGrad only support x rank = 4,
auto x_shape_vec = phi::vectorize(d_x->dims());
if (data_layout == DataLayout::kNCHW) {
x_shape_vec.push_back(1); // expand NCL -> NCL1
} else {
x_shape_vec.insert(x_shape_vec.begin() + 2,
1); // expand NLC -> NL1C
}
auto x_new_shape = phi::make_ddim(x_shape_vec);
dx_tensor.Resize(x_new_shape);
dy_tensor.Resize(x_new_shape);
}
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto &runner_infer =
NpuOpRunner("BNInferGrad",
{dy_tensor, *scale, *running_var},
{dx_tensor},
{{"epsilon", epsilon}});
runner_infer.Run(stream);
} else {
const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad",
{dy_tensor,
x_tensor,
*d_scale,
*d_bias,
*scale,
*saved_mean,
*saved_inv_variance},
{dx_tensor},
{{"epsilon", epsilon}});
runner_reduce.Run(stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(batch_norm,
ops::NPUBatchNormOpKernel<float>,
ops::NPUBatchNormOpKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(batch_norm_grad,
ops::NPUBatchNormGradOpKernel<float>,
ops::NPUBatchNormGradOpKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class BCELossNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("BinaryCrossEntropy",
{*x, *labels},
{*out},
{{"reduction", static_cast<std::string>("none")}});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class BCELossGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("BinaryCrossEntropyGrad",
{*x, *labels, *dout},
{*dx},
{{"reduction", static_cast<std::string>("none")}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
bce_loss,
ops::BCELossNPUKernel<plat::NPUDeviceContext, float>,
ops::BCELossNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
bce_loss_grad,
ops::BCELossGradNPUKernel<plat::NPUDeviceContext, float>,
ops::BCELossGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/beam_search_op.h"
namespace ops = paddle::operators;
using NPUCtx = paddle::platform::NPUDeviceContext;
REGISTER_OP_NPU_KERNEL(beam_search,
ops::BeamSearchOpKernel<float, NPUCtx>,
ops::BeamSearchOpKernel<double, NPUCtx>,
ops::BeamSearchOpKernel<int, NPUCtx>,
ops::BeamSearchOpKernel<int64_t, NPUCtx>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
namespace paddle {
namespace operators {
static std::map<framework::proto::VarType::Type, aclDataType>
DTYPE_2_ACL_DTYPE = {
{framework::proto::VarType::BOOL, ACL_BOOL},
{framework::proto::VarType::INT16, ACL_INT16},
{framework::proto::VarType::INT32, ACL_INT32},
{framework::proto::VarType::INT64, ACL_INT64},
{framework::proto::VarType::FP16, ACL_FLOAT16},
{framework::proto::VarType::FP32, ACL_FLOAT},
{framework::proto::VarType::FP64, ACL_DOUBLE},
};
template <typename DeviceContext, typename T>
class CastNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
int dtype = ctx.Attr<int>("out_dtype");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
if (framework::TransToProtoVarType(x->dtype()) == dtype) {
// NOTE(zhiqiu): NPU cast op may result in wrong value, so
// add special case here.
VLOG(4) << "cast to same dtype:" << dtype;
out->mutable_data(place, x->type());
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
return;
}
auto iter = DTYPE_2_ACL_DTYPE.find(
static_cast<framework::proto::VarType::Type>(dtype));
int aclDtype = iter->second;
if (dtype == framework::proto::VarType::FP32) {
out->mutable_data<float>(place);
} else if (dtype == framework::proto::VarType::FP16) {
out->mutable_data<paddle::platform::float16>(place);
} else if (dtype == framework::proto::VarType::INT16) {
out->mutable_data<int16_t>(place);
} else if (dtype == framework::proto::VarType::INT32) {
out->mutable_data<int32_t>(place);
} else if (dtype == framework::proto::VarType::INT64) {
out->mutable_data<int64_t>(place);
} else if (dtype == framework::proto::VarType::FP64) {
out->mutable_data<double>(place);
} else if (dtype == framework::proto::VarType::BOOL) {
out->mutable_data<bool>(place);
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner(
"Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
cast,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int16_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int32_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, bool>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/clip_by_norm_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class NPUClipByNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto max_norm = context.Attr<float>("max_norm");
auto in_var = context.InputVar("X");
if (!(in_var->IsType<phi::DenseTensor>())) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input variable type, only support LodTensor"
"type, but got type is %s.",
framework::ToTypeName(in_var->Type())));
}
auto place = context.GetPlace();
auto& dev_ctx =
context.template device_context<paddle::platform::NPUDeviceContext>();
auto stream = dev_ctx.stream();
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(place);
PADDLE_ENFORCE_NOT_NULL(input,
platform::errors::InvalidArgument(
"Input(X) of ClipByNormOp should not be null. "
"Please check if it is created correctly."));
phi::DenseTensor square_sum(input->type());
square_sum.mutable_data<T>(framework::DDim({1}), place);
const auto& x_dims = input->dims();
std::vector<int> axis;
for (int i = 0; i < x_dims.size(); ++i) {
axis.push_back(i);
}
const auto& square_sum_runner =
NpuOpRunner("SquareSumV1",
{*input},
{square_sum},
{{"axis", axis}, {"keep_dims", false}});
square_sum_runner.Run(stream);
phi::DenseTensor x_norm(input->type());
x_norm.mutable_data<T>(framework::DDim({1}), place);
const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {});
x_norm_runner.Run(stream);
phi::DenseTensor x_norm_t;
framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t);
auto x_norm_v = static_cast<float>(*x_norm_t.data<T>());
if (x_norm_v <= max_norm) {
framework::TensorCopy(*input, place, dev_ctx, output);
} else {
auto epsilon = x_norm_v <= static_cast<float>(1e-30)
? static_cast<float>(1e-6)
: static_cast<float>(0);
float scaling = max_norm / (x_norm_v + epsilon);
const auto& muls_runner =
NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}});
muls_runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
clip_by_norm,
ops::NPUClipByNormKernel<paddle::platform::NPUDeviceContext, float>,
ops::NPUClipByNormKernel<paddle::platform::NPUDeviceContext,
plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ClipNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
phi::DenseTensor min_tensor_temp(x->type());
phi::DenseTensor max_tensor_temp(x->type());
if (min_tensor == nullptr) {
auto min_value = static_cast<T>(ctx.Attr<float>("min"));
min_tensor_temp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&min_tensor_temp, min_value);
min_tensor = &min_tensor_temp;
}
if (max_tensor == nullptr) {
auto max_value = static_cast<T>(ctx.Attr<float>("max"));
max_tensor_temp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&max_tensor_temp, max_value);
max_tensor = &max_tensor_temp;
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ClipGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto* max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
auto min_val = ctx.Attr<float>("min");
if (min_tensor) {
phi::DenseTensor min_data;
framework::TensorCopy(
*min_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&min_data);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
min_val = static_cast<float>(min_data.data<T>()[0]);
}
auto max_val = ctx.Attr<float>("max");
if (max_tensor) {
phi::DenseTensor max_data;
framework::TensorCopy(
*max_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&max_data);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
max_val = static_cast<float>(max_data.data<T>()[0]);
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("HardtanhGrad",
{*x, *dout},
{*dx},
{{"min_val", min_val}, {"max_val", max_val}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
clip,
ops::ClipNPUKernel<plat::NPUDeviceContext, float>,
ops::ClipNPUKernel<plat::NPUDeviceContext, plat::float16>);
REGISTER_OP_NPU_KERNEL(
clip_grad,
ops::ClipGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ClipGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ConcatNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
if (ctx.HasInput("AxisTensor")) {
PADDLE_THROW(platform::errors::NotFound(
"The AxisTensor is not supported on NPU now."));
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
std::vector<phi::DenseTensor> inputs;
std::vector<std::string> names;
for (size_t i = 0; i < ins.size(); ++i) {
if (ins[i] && ins[i]->numel() > 0) {
inputs.push_back(*ins[i]);
names.push_back("x" + std::to_string(i));
} else {
continue;
}
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
NpuOpRunner runner{
"ConcatD",
{inputs},
{*out},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
};
template <typename T>
class ConcatGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("X"));
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
int offset = 0;
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
for (size_t j = 0; j < outs.size(); ++j) {
// For stop gradient
// get output tensor that the name is not kEmptyVarName
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
std::vector<int> offsets;
std::vector<int> sizes;
for (int dim = 0; dim < ins[j]->dims().size(); ++dim) {
if (dim == axis) {
offsets.push_back(offset);
sizes.push_back(ins[j]->dims()[dim]);
} else {
offsets.push_back(0);
sizes.push_back(ins[j]->dims()[dim]);
}
}
const auto& runner =
NpuOpRunner("SliceD",
{*out_grad},
{*outs[j]},
{{"offsets", offsets}, {"size", sizes}});
runner.Run(stream);
}
if (ins[j]->numel() != 0UL) {
offset += ins[j]->dims()[axis];
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(concat,
ops::ConcatNPUKernel<float>,
ops::ConcatNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ConcatNPUKernel<int64_t>,
#endif
ops::ConcatNPUKernel<int>);
REGISTER_OP_NPU_KERNEL(concat_grad,
ops::ConcatGradNPUKernel<float>,
ops::ConcatGradNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ConcatGradNPUKernel<int64_t>,
#endif
ops::ConcatGradNPUKernel<int>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class Conv2DTransposeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
const std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
// construct NPU attr
std::vector<int> strides(4, 1);
std::vector<int> dilations(4, 1);
phi::DenseTensor input_tensor, output_tensor;
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
if (channel_last) {
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
strides[1] = stride[0];
strides[2] = stride[1];
dilations[1] = dilation[0];
dilations[2] = dilation[1];
} else {
strides[2] = stride[0];
strides[3] = stride[1];
dilations[2] = dilation[0];
dilations[3] = dilation[1];
}
for (auto i = output_padding.size(); i < 4; ++i) {
output_padding.insert(output_padding.begin(), 0);
}
auto output_dim_vec = phi::vectorize(output_tensor.dims());
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
const auto& runner = NpuOpRunner("Conv2DTransposeD",
{input_tensor, *filter},
{output_tensor},
{{"input_size", output_dim_vec},
{"strides", strides},
{"dilations", dilations},
{"output_padding", output_padding},
{"groups", groups},
{"pads", padding},
{"data_format", data_format}});
runner.Run(stream);
}
};
template <typename T>
class Conv2DTransposeGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
phi::DenseTensor* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
phi::DenseTensor* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const phi::DataLayout data_layout = phi::StringToDataLayout(data_format);
auto in_dims = input->dims();
auto filter_dims = filter->dims();
// auto out_grad_dims = output_grad->dims();
// const int batch_size = static_cast<int>(input->dims()[0]);
const bool channel_last = (data_layout == phi::DataLayout::kNHWC);
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
std::vector<int> strides_vec(4, 1);
std::vector<int> dilations_vec(4, 1);
phi::DenseTensor input_tensor, output_grad_tensor;
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
if (channel_last) {
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
strides_vec[1] = strides[0];
strides_vec[2] = strides[1];
dilations_vec[1] = dilations[0];
dilations_vec[2] = dilations[1];
} else {
strides_vec[2] = strides[0];
strides_vec[3] = strides[1];
dilations_vec[2] = dilations[0];
dilations_vec[3] = dilations[1];
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Conv2DBackpropFilterD",
{output_grad_tensor, input_tensor},
{*filter_grad},
{{"filter_size", phi::vectorize<int>(filter_dims)},
{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor;
input_grad_tensor.ShareDataWith(*input_grad);
if (channel_last) {
input_grad_tensor.set_layout(DataLayout::kNHWC);
}
const auto& runner = NpuOpRunner("Conv2D",
{output_grad_tensor, *filter},
{input_grad_tensor},
{{"strides", strides_vec},
{"pads", paddings},
{"dilations", dilations_vec},
{"groups", groups},
{"data_format", data_format}});
runner.Run(stream);
}
}
};
template <typename T>
class Conv3DTransposeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
if (data_format == "NHWC") {
data_format = "NDHWC";
} else {
data_format = "NCDHW";
}
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
// construct NPU attr
std::vector<int> strides(5, 1);
std::vector<int> dilations(5, 1);
phi::DenseTensor input_tensor, output_tensor, filter_tensor;
input_tensor.Resize(input->dims());
input_tensor.ShareDataWith(*input);
output_tensor.Resize(output->dims());
output_tensor.ShareDataWith(*output);
filter_tensor.Resize(filter->dims());
filter_tensor.ShareDataWith(*filter);
PADDLE_ENFORCE_EQ(
dilation[0],
1,
platform::errors::InvalidArgument(
"dilation[0] must be equal 1, but received %d.", dilation[0]));
if (channel_last) {
input_tensor.set_layout(DataLayout::kNDHWC);
output_tensor.set_layout(DataLayout::kNDHWC);
strides[1] = stride[0];
strides[2] = stride[1];
strides[3] = stride[2];
dilations[2] = dilation[1];
dilations[3] = dilation[2];
} else {
input_tensor.set_layout(DataLayout::kNCDHW);
output_tensor.set_layout(DataLayout::kNCDHW);
strides[2] = stride[0];
strides[3] = stride[1];
strides[4] = stride[2];
dilations[3] = dilation[1];
dilations[4] = dilation[2];
}
filter_tensor.set_layout(DataLayout::kNCDHW);
auto output_dim_vec = phi::vectorize<int32_t>(output_tensor.dims());
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
NpuOpRunner runner;
runner.SetType("Conv3DBackpropInputD")
.AddInput(filter_tensor)
.AddInput(input_tensor)
.AddAttr("input_size", output_dim_vec)
.AddAttr("strides", strides)
.AddAttr("pads", padding)
.AddAttr("dilations", dilations)
.AddAttr("groups", groups)
.AddAttr("data_format", data_format)
.AddOutput(output_tensor);
runner.Run(dev_ctx.stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(conv2d_transpose,
ops::Conv2DTransposeNPUKernel<float>,
ops::Conv2DTransposeNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad,
ops::Conv2DTransposeGradNPUKernel<float>,
ops::Conv2DTransposeGradNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(conv3d_transpose,
ops::Conv3DTransposeNPUKernel<float>,
ops::Conv3DTransposeNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class CropNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
std::vector<int> offset_list;
if (ctx.HasInput("Offsets")) {
auto* offsets_tensor = ctx.Input<phi::DenseTensor>("Offsets");
paddle::framework::TensorToVector(
*offsets_tensor, ctx.device_context(), &offset_list);
if (offset_list.empty()) {
offset_list.resize(x->dims().size(), 0);
}
} else {
auto res = ctx.Attr<std::vector<int>>("offsets");
if (res.empty()) {
offset_list.resize(x->dims().size(), 0);
} else {
offset_list.insert(offset_list.end(), res.begin(), res.end());
}
}
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(offset_list.size()),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape (%d) of CropOp's "
"'offset' attribute should be equal to the shape of dims "
"(%d) of the Input(X).",
offset_list.size(),
x->dims().size()));
int axis_int = 0;
framework::NPUAttributeMap attr_input = {{"offsets", offset_list},
{"axis", axis_int}};
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
if (ctx.HasInput("Y")) {
auto* shape = ctx.Input<phi::DenseTensor>("Y");
PADDLE_ENFORCE_EQ(shape->dims().size(),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape of dims of (%d) of CropOp's "
"Input(shape) should be equal to the shape of dims "
"(%d) of the Input(X).",
shape->dims().size(),
x->dims().size()));
// shape memory maybe have gc.
phi::DenseTensor tmp_shape(*shape);
tmp_shape.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} else {
auto shape_size = ctx.Attr<std::vector<int>>("shape");
PADDLE_ENFORCE_EQ(shape_size.size(),
x->dims().size(),
platform::errors::InvalidArgument(
"The shape of dims of (%d) of CropOp's "
"Input(shape) should be equal to the shape of dims "
"(%d) of the Input(X).",
shape_size.size(),
x->dims().size()));
phi::DenseTensor tmp_shape(x->dtype());
tmp_shape.Resize(phi::make_ddim(shape_size));
tmp_shape.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
crop,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::CropNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
static void CumsumImp(const phi::DenseTensor& input,
phi::DenseTensor* output,
const framework::NPUAttributeMap& attr_input,
const framework::ExecutionContext& ctx) {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (framework::TransToProtoVarType(input.dtype()) ==
framework::proto::VarType::INT64) {
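    // For INT64 input, CumsumD is run in float32: cast the input up front,
    // accumulate into a float32 buffer, then cast the result back to the
    // output dtype.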
phi::DenseTensor tmp_input;
tmp_input.mutable_data<float>(input.dims(), ctx.GetPlace());
auto dst_acl_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type()));
const auto& cast_runner_1 =
NpuOpRunner("Cast",
{input},
{tmp_input},
{{"dst_type", static_cast<int>(dst_acl_dtype)}});
cast_runner_1.Run(stream);
phi::DenseTensor tmp_output;
tmp_output.mutable_data<float>(output->dims(), ctx.GetPlace());
const auto& runner =
NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input);
runner.Run(stream);
dst_acl_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(output->type()));
const auto& cast_runner_2 =
NpuOpRunner("Cast",
{tmp_output},
{*output},
{{"dst_type", static_cast<int>(dst_acl_dtype)}});
cast_runner_2.Run(stream);
} else {
const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input);
runner.Run(stream);
}
}
template <typename DeviceContext, typename T>
class CumSumNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
bool exclusive = ctx.Attr<bool>("exclusive");
bool reverse = ctx.Attr<bool>("reverse");
out->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {
{"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}};
bool flatten = ctx.Attr<bool>("flatten");
if (flatten) {
PADDLE_ENFORCE_EQ(
axis,
-1,
platform::errors::InvalidArgument(
"when flatten is true, attr axis must be default %d, but got %d",
-1,
axis));
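      // Flatten: view x as a 1-D tensor of numel() elements before the scan.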
phi::DenseTensor new_x(x->type());
new_x.ShareDataWith(*x);
new_x.Resize(phi::make_ddim({x->numel()}));
CumsumImp(new_x, out, attr_input, ctx);
} else {
CumsumImp(*x, out, attr_input, ctx);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
cumsum,
ops::CumSumNPUKernel<plat::NPUDeviceContext, int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::CumSumNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
ops::CumSumNPUKernel<plat::NPUDeviceContext, float>,
ops::CumSumNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DropoutNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* seed_tensor =
ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto* mask = ctx.Output<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dropout_prob == 1.) {
const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out});
runner_zeros_out.Run(stream);
mask->mutable_data<uint8_t>(ctx.GetPlace());
const auto& runner_zeros_mask =
NpuOpRunner("ZerosLike", {*mask}, {*mask});
runner_zeros_mask.Run(stream);
return;
}
    // Only the default `upscale_in_train` dropout implementation is
    // supported here.
if (!is_test) {
phi::DenseTensor tmp_x(x->dtype());
phi::DenseTensor tmp_out(out->dtype());
tmp_x.ShareDataWith(*x);
tmp_out.ShareDataWith(*out);
if (x->dims().size() == 1) {
        // DropOutDoMask produces wrong results for 1-D input,
        // so reshape it to 2-D here.
std::vector<int> vec_dim = phi::vectorize<int>(x->dims());
tmp_x.Resize(phi::make_ddim({vec_dim[0], 1}));
tmp_out.Resize(phi::make_ddim({vec_dim[0], 1}));
}
int seed = 0;
int seed2 = 0;
float keep_prob = 1. - dropout_prob;
if (seed_tensor) {
std::vector<int> seed_data;
paddle::framework::TensorToVector(
*seed_tensor, ctx.device_context(), &seed_data);
seed = seed_data[0];
} else {
seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
}
phi::DenseTensor keep_prob_tensor(x->dtype());
keep_prob_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&keep_prob_tensor,
static_cast<T>(keep_prob));
mask->mutable_data<uint8_t>(ctx.GetPlace());
      // The bit mask produced by the `DropOutGenMask` NPU op has a different
      // layout from this kernel's output `Mask`.
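      // DropOutGenMask emits one mask bit per element and expects the element
      // count rounded up to a multiple of 128, so the uint8 buffer holds
      // length / 8 bytes.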
phi::DenseTensor npu_mask(phi::DataType::UINT8);
uint32_t length = (x->numel() + 128 - 1) / 128 * 128;
npu_mask.Resize(phi::make_ddim({length / 8}));
npu_mask.mutable_data<uint8_t>(ctx.GetPlace());
// TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
// OP must be a scalar with shape[0]. At present, the shape
// of the `prob` phi::DenseTensor of this OP is forced to be set to 0
// in `npu_op_runner.cc`, which needs to be optimized later.
NpuOpRunner runner_gen_mask;
runner_gen_mask.SetType("DropOutGenMask")
.AddInput(phi::vectorize(tmp_out.dims()))
.AddInput(keep_prob_tensor)
.AddOutput(npu_mask)
.AddAttr("seed", seed)
.AddAttr("seed2", seed2);
runner_gen_mask.Run(stream);
NpuOpRunner runner_dropout;
runner_dropout.SetType("DropOutDoMask")
.AddInput(tmp_x)
.AddInput(npu_mask)
.AddInput(keep_prob_tensor)
.AddOutput(tmp_out);
runner_dropout.Run(stream);
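      // After DropOutDoMask, `out` holds zeros at dropped positions, so
      // casting it to bool (and then to uint8 below) recovers the `Mask`
      // output.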
// cast `out` from float/float16 to bool
phi::DenseTensor cast_mask(phi::DataType::BOOL);
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<bool>(ctx.GetPlace());
auto dst_dtype_bool =
ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype()));
const auto& runner_cast_mask_bool =
NpuOpRunner("Cast",
{*out},
{cast_mask},
{{"dst_type", static_cast<int>(dst_dtype_bool)}});
runner_cast_mask_bool.Run(stream);
// cast cast_mask from bool to uint8
auto dst_dtype_uint8 =
ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype()));
const auto& runner_cast_mask_uint8 =
NpuOpRunner("Cast",
{cast_mask},
{*mask},
{{"dst_type", static_cast<int>(dst_dtype_uint8)}});
runner_cast_mask_uint8.Run(stream);
} else {
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
}
}
};
template <typename DeviceContext, typename T>
class DropoutGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(is_test,
false,
platform::errors::PreconditionNotMet(
"GradOp is only callable when is_test is false"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dropout_prob == 1.) {
const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx});
runner_zeros.Run(stream);
return;
}
// cast mask from uint8 to float32/float16
phi::DenseTensor cast_mask(dx->dtype());
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<T>(ctx.GetPlace());
auto dst_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype()));
const auto& runner_cast_mask =
NpuOpRunner("Cast",
{*mask},
{cast_mask},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_mask.Run(stream);
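    // MaskedScale multiplies dout by the mask and rescales the kept values
    // by 1 / keep_prob, matching the `upscale_in_train` forward pass.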
const auto& runner =
NpuOpRunner("MaskedScale",
{*dout, cast_mask},
{*dx},
{{"value", static_cast<float>(1. / (1 - dropout_prob))}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
dropout,
ops::DropoutNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::DropoutNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
dropout_grad,
ops::DropoutGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::DropoutGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/expand_as_v2_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandAsV2NPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto target_rank = target_shape.size();
PADDLE_ENFORCE_GE(target_rank,
rank,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be greater than or equal to "
"the rank (%d) of the input 'x'.",
target_rank,
rank));
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
"expand_as_v2 op must be positive.",
rank));
PADDLE_ENFORCE_LE(target_rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be less than or equal to %d.",
target_rank,
MAX_RANK_SUPPORTED));
ExpandAs(context);
}
protected:
void ExpandAs(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = target_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(target_shape[i],
0,
platform::errors::InvalidArgument(
"The value of target shape cannot be zero."));
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
target_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in "
"target tensor for expand_as_v2 op.",
vec_in_dims[i],
target_shape[i]));
}
}
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims = phi::make_ddim(target_shape);
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
const auto& runner =
NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}});
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand_as_v2,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext, uint8_t>,
ops::ExpandAsV2NPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'x' for Op(expand) "
"must be greater than or equal to 1, but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number of dimensions of the input 'x' for Op(expand) "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
switch (rank) {
case 1:
Expand<1>(context);
break;
case 2:
Expand<2>(context);
break;
case 3:
Expand<3>(context);
break;
case 4:
Expand<4>(context);
break;
case 5:
Expand<5>(context);
break;
case 6:
Expand<6>(context);
break;
}
}
protected:
template <int Rank>
void Expand(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto expand_times = get_expand_times(context);
PADDLE_ENFORCE_EQ(static_cast<size_t>(in_dims.size()),
expand_times.size(),
platform::errors::InvalidArgument(
"The number of elements (%d) of 'expand_times' for "
"Op(expand) must be equal to the number "
"of dimensions (%d) of the input.",
expand_times.size(),
static_cast<size_t>(in_dims.size())));
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims(in_dims);
for (size_t i = 0; i < expand_times.size(); ++i) {
out_dims[i] *= expand_times[i];
}
auto place = context.GetPlace();
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
out0->Resize(out_dims);
out0->mutable_data<T>(place);
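    // When every entry of expand_times is 1, the output has exactly as many
    // elements as the input, so a plain device copy is sufficient.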
    bool is_expand_times_all_one = (out0->numel() == in0->numel());
if (is_expand_times_all_one) {
memory::Copy(place,
out0->mutable_data<T>(place),
place,
in0->data<T>(),
in0->numel() * sizeof(T),
stream);
if (out_dims != in_dims) {
out0->Resize(out_dims);
}
} else {
const auto& runner =
NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <iostream>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP(expand);
USE_OP_DEVICE_KERNEL(expand, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto in = scope->Var("X");
auto expand_times = scope->Var("ExpandTimes");
auto out = scope->Var("Out");
auto in_t = in->GetMutable<phi::DenseTensor>();
auto out_t = out->GetMutable<phi::DenseTensor>();
auto expand_times_t = expand_times->GetMutable<phi::DenseTensor>();
auto place = ctx.GetPlace();
paddle::framework::TensorFromVector(std::vector<T>(3 * 1 * 7, 1), ctx, in_t);
paddle::framework::TensorFromVector(
std::vector<int>({1, 10, 1}), ctx, expand_times_t);
in_t->Resize(phi::make_ddim({3, 1, 7}));
expand_times_t->Resize(phi::make_ddim({3}));
out_t->Resize(phi::make_ddim({3, 10, 7}));
out_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op =
f::OpRegistry::CreateOp("expand",
{{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}},
{{"Out", {"Out"}}},
attrs);
op->Run(*scope, place);
ctx.Wait();
auto out_dim = out_t->dims();
EXPECT_EQ(out_dim.at(0), 3);
EXPECT_EQ(out_dim.at(1), 10);
EXPECT_EQ(out_dim.at(2), 7);
}
TEST(expand, NPU_fp32) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExpandV2NPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = X->dims();
auto expand_shape = get_expand_shape(ctx);
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> final_expand_shape(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size cannot be zero."));
if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] -->
// final_expand_shape = [3,4,10,2]
PADDLE_ENFORCE_GT(
expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand_v2 op.",
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X =
// [10,1] --> final_expand_shape =
// [3,4,10,4]
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand_v2 op.",
vec_in_dims[i],
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else {
final_expand_shape[i] = expand_shape[i];
}
} else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
// = [3,4,10,2]
PADDLE_ENFORCE_EQ(
expand_shape[i],
-1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
final_expand_shape[i] = vec_in_dims[i];
}
}
framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}};
auto rank = X->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_npu op must be positive, "
"but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_npu op must be less than "
"or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
auto shape_size = final_expand_shape.size();
PADDLE_ENFORCE_GE(
shape_size,
rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand_v2_npu op must "
"be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size,
rank));
PADDLE_ENFORCE_LE(shape_size,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for "
"expand_v2_npu op must be "
"less than or equal to %d.",
shape_size,
MAX_RANK_SUPPORTED));
framework::DDim out_dims = phi::make_ddim(final_expand_shape);
Out->Resize(out_dims);
Out->mutable_data<T>(ctx.GetPlace());
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs);
runner.Run(dev_ctx.stream());
};
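    // bool and int64 inputs are routed through TypeAdapter so that ExpandD
    // runs in uint8 / int32; the result is cast back to the original dtype.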
if (framework::TransToProtoVarType(X->dtype()) ==
framework::proto::VarType::BOOL) {
NpuOpRunner::TypeAdapter({*X},
{*Out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::UINT8},
{framework::proto::VarType::UINT8});
} else if (framework::TransToProtoVarType(X->dtype()) ==
framework::proto::VarType::INT64) {
NpuOpRunner::TypeAdapter({*X},
{*Out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input);
runner.Run(dev_ctx.stream());
}
}
};
template <typename DeviceContext, typename T>
class ExpandV2NPUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// case 1: reduce dout dims to dx dims
// For example: [2, 120] --> [120]
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor tmp_dout(dout->dtype());
phi::DenseTensor reduced_dout(dx->dtype());
tmp_dout.ShareDataWith(*dout);
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
tmp_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = reduced_dout;
}
// case 2: reduce axis of dout in which dim is 1
// For example: [12, 140] --> [1, 140]
// case 3: copy dout to dx when shape is totally same, and dim in dx != 1
// For example: [2, 10, 5] --> [2, 10, 5]
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
const auto& runner = NpuOpRunner("ReduceSumD",
{tmp_dout},
{*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
expand_v2,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, bool>);
REGISTER_OP_NPU_KERNEL(
expand_v2_grad,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, float>,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EyeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto num_rows = ctx.Attr<int64_t>("num_rows");
auto d_nums = ctx.Attr<int>("dtype");
auto dtype =
ConvertToNpuDtype(static_cast<framework::proto::VarType::Type>(d_nums));
auto num_columns = ctx.Attr<int64_t>("num_columns");
if (num_columns == -1) num_columns = num_rows;
framework::NPUAttributeMap attr_input = {
{"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}};
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
eye,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::EyeNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class FillAnyLikeNPUKernel : public framework::OpKernel<T> {
public:
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, platform::float16>::value,
float,
T>::type>::type;
void Compute(const framework::ExecutionContext& context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
auto* out = context.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(context.GetPlace());
float value = context.Attr<float>("value");
auto common_type_value = static_cast<CommonType>(value);
PADDLE_ENFORCE_EQ(
(common_type_value >=
static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
platform::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
typeid(T).name(),
static_cast<CommonType>(std::numeric_limits<T>::lowest()),
static_cast<CommonType>(std::numeric_limits<T>::max()),
value));
PADDLE_ENFORCE_EQ(
std::isnan(value),
false,
platform::errors::InvalidArgument("The filled value is NaN."));
    phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type));
tensor_tmp.mutable_data<T>({1}, context.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value));
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto shape = out->dims();
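    // The NPU `Fill` op takes the target shape as its first input and a
    // single-element tensor holding the fill value as its second.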
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_tmp)
.AddOutput(*out)
.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(fill_any_like,
ops::FillAnyLikeNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::FillAnyLikeNPUKernel<int64_t>,
#endif
ops::FillAnyLikeNPUKernel<float>,
ops::FillAnyLikeNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto *in = ctx.Input<phi::DenseTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the phi::DenseTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<phi::CPUContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const phi::CPUContext &>(dev_ctx),
out,
static_cast<T>(value));
} else {
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type));
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_tmp, value);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto &runner = NpuOpRunner("FillD",
{tensor_tmp},
{*out},
{{"dims", phi::vectorize(out->dims())}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
float>,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
int>,
ops::FillConstantBatchSizeLikeOpNPUKernel<
paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto str_value = ctx.Attr<std::string>("str_value");
auto float_value = ctx.Attr<float>("value");
auto *out_var = ctx.Output<phi::DenseTensor>("Out");
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
if (data_type != framework::proto::VarType::BOOL) {
      phi::DenseTensor tensor_value(framework::TransToPhiDataType(data_type));
tensor_value.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&tensor_value, value);
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_value)
.AddOutput(*out_var)
.Run(stream);
} else {
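      // Boolean outputs go through TypeAdapter: the value is written as
      // uint8 and the result is cast back to bool afterwards.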
const auto &dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto op_func = [&shape, &value](
                         const std::vector<phi::DenseTensor> &inputs,
                         const std::vector<phi::DenseTensor> &outputs,
                         const NPUAttributeMap &attrs,
                         const platform::NPUDeviceContext &dev_ctx) {
        phi::DenseTensor tensor_value;
tensor_value.mutable_data<uint8_t>({1}, dev_ctx.GetPlace());
FillNpuTensorWithConstant<uint8_t>(&tensor_value,
static_cast<uint8_t>(value));
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(shape))
.AddInput(tensor_value)
.AddOutput(outputs[0])
.Run(dev_ctx.stream());
};
NpuOpRunner::TypeAdapter({},
{*out_var},
{},
dev_ctx,
op_func,
{},
{framework::proto::VarType::UINT8});
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_NPU_KERNEL(
fill_constant,
paddle::operators::FillConstantNPUKernel<float>,
paddle::operators::FillConstantNPUKernel<bool>,
paddle::operators::FillConstantNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
paddle::operators::FillConstantNPUKernel<int64_t>,
#endif
paddle::operators::FillConstantNPUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_zeros_like_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillZerosLikeNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<phi::DenseTensor>("X");
auto* out = context.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
fill_zeros_like,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::FillZerosLikeNPUKernel<paddle::platform::NPUDeviceContext, bool>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
template <typename T>
class GatherNdNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *out = ctx.Output<phi::DenseTensor>("Out");
out->template mutable_data<T>(ctx.GetPlace());
if (x->numel() == 0) return;
if (index->numel() == 0) {
framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out);
return;
}
const auto &index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
platform::errors::InvalidArgument(
"Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {});
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
runner.Run(stream);
}
};
template <typename T>
class GatherNdGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *p = dx->mutable_data<T>(ctx.GetPlace());
if (dx->numel() == 0) return;
if (index->numel() == 0) {
framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx);
return;
}
phi::DenseTensor tmp_tensor(index->type());
phi::DenseTensor tmp_tensor2(dout->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {1, index_dims[0]};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
tmp_tensor2.ShareDataWith(*dout);
std::vector<int64_t> new_dim2{1};
for (int i = index->numel(); i < x->dims().size(); i++) {
new_dim2.push_back(x->dims()[i]);
}
tmp_tensor2.Resize(phi::make_ddim(new_dim2));
dout = &tmp_tensor2;
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
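    // Zero-fill dx, then scatter-add dout into it at the positions given by
    // index.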
platform::NPUMemsetAsync(
static_cast<void *>(p), 0, dx->numel() * sizeof(T), stream);
const auto &runner_scatter = NpuOpRunner(
"ScatterNdAdd", {*dx, *index, *dout}, {*dx}, {{"use_locking", false}});
runner_scatter.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(gather_nd,
ops::GatherNdNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::GatherNdNPUKernel<int64_t>,
#endif
ops::GatherNdNPUKernel<float>);
REGISTER_OP_NPU_KERNEL(gather_nd_grad,
ops::GatherNdGradNPUKernel<paddle::platform::float16>,
ops::GatherNdGradNPUKernel<float>);
This diff is collapsed.
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/is_empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
is_empty,
ops::IsEmptyOpKernel<plat::NPUDeviceContext, float>,
ops::IsEmptyOpKernel<plat::NPUDeviceContext, plat::float16>);
This diff is collapsed.