Unverified commit b305629c, authored by 陈沧夜, committed by GitHub

remove *npu.cc (#53342)

Parent cf6ed7cb
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/impl/box_coder.h"
namespace paddle {
namespace operators {
template <typename T>
struct BoxCoderFunction {
public:
explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
}
phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) {
phi::DenseTensor y;
y.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}});
runner.Run(stream);
return y;
}
phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) {
phi::DenseTensor y;
y.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}});
runner.Run(stream);
return y;
}
phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) {
phi::DenseTensor z;
z.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {});
runner.Run(stream);
return z;
}
phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape) {
phi::DenseTensor z;
z.mutable_data<T>(shape, place);
const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {});
runner.Run(stream);
return z;
}
void DivWithBroadCastVoid(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape,
phi::DenseTensor* z) {
z->mutable_data<T>(shape, place);
const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {});
runner.Run(stream);
}
phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape) {
phi::DenseTensor z;
DivWithBroadCastVoid(x, y, shape, &z);
return z;
}
void MulWithBroadCastVoid(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape,
phi::DenseTensor* z) {
z->mutable_data<T>(shape, place);
const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {});
runner.Run(stream);
}
phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape) {
phi::DenseTensor z;
MulWithBroadCastVoid(x, y, shape, &z);
return z;
}
void AddWithBroadCastVoid(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape,
phi::DenseTensor* z) {
z->mutable_data<T>(shape, place);
const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {});
runner.Run(stream);
}
phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x,
const phi::DenseTensor& y,
const framework::DDim& shape) {
phi::DenseTensor z;
AddWithBroadCastVoid(x, y, shape, &z);
return z;
}
phi::DenseTensor Abs(const phi::DenseTensor& x) {
phi::DenseTensor y;
y.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Abs", {x}, {y}, {});
runner.Run(stream);
return y;
}
phi::DenseTensor Log(const phi::DenseTensor& x) {
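// log(x) is computed as log1p(x - 1) via the Ascend Log1p operator.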
phi::DenseTensor t_x_m1 = Adds(x, -1);
phi::DenseTensor y;
y.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {});
runner.Run(stream);
return y;
}
phi::DenseTensor Exp(const phi::DenseTensor& x) {
phi::DenseTensor y;
y.mutable_data<T>(x.dims(), place);
const auto& runner = NpuOpRunner("Exp", {x}, {y}, {});
runner.Run(stream);
return y;
}
phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) {
auto dim_x = x.dims();
auto dim_y = y.dims();
PADDLE_ENFORCE_EQ(
dim_x.size(),
2,
platform::errors::InvalidArgument(
"x should be a 2-dim tensor, but got %d-dim.", dim_x.size()));
PADDLE_ENFORCE_EQ(
dim_y.size(),
2,
platform::errors::InvalidArgument(
"y should be a 2-dim tensor, but got %d-dim.", dim_y.size()));
PADDLE_ENFORCE_EQ(
dim_x[1],
dim_y[0],
platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but "
"got dim_x[1] = %d, dim_y[0] = %d.",
dim_x[1],
dim_y[0]));
phi::DenseTensor z;
z.mutable_data<T>({dim_x[0], dim_y[1]}, place);
const auto& runner =
NpuOpRunner("MatMul",
{x, y},
{z},
{{"transpose_x1", false}, {"transpose_x2", false}});
runner.Run(stream);
return z;
}
void ConcatVoid(const std::vector<phi::DenseTensor>& inputs,
const framework::DDim& shape_out,
int axis,
phi::DenseTensor* output) {
output->mutable_data<T>(shape_out, place);
std::vector<std::string> names;
for (size_t i = 0; i < inputs.size(); i++) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{inputs},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
phi::DenseTensor Concat(const std::vector<phi::DenseTensor>& inputs,
const framework::DDim& shape_out,
int axis) {
phi::DenseTensor output;
ConcatVoid(inputs, shape_out, axis, &output);
return output;
}
phi::DenseTensor Slice(const phi::DenseTensor& x,
const std::vector<int>& offsets,
const std::vector<int>& size,
const framework::DDim& shape) {
phi::DenseTensor y;
y.mutable_data<T>(shape, place);
const auto& runner =
NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}});
runner.Run(stream);
return y;
}
private:
platform::Place place;
aclrtStream stream;
const framework::ExecutionContext& ctx;
};
template <typename T>
void Vector2Tensor(const framework::ExecutionContext& ctx,
const std::vector<T>& vec,
const framework::DDim& ddim,
phi::DenseTensor* tsr) {
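// Copy the host vector to device memory, wait for the async copy, then reshape.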
framework::TensorFromVector<T>(vec, ctx.device_context(), tsr);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
tsr->Resize(ddim);
}
template <typename T>
void BoxCoderEnc(const framework::ExecutionContext& ctx,
const phi::DenseTensor* tb,
const phi::DenseTensor* pb,
const phi::DenseTensor* pbv,
const bool norm,
const std::vector<float>& variance,
phi::DenseTensor* out) {
auto M = pb->dims()[0];
auto N = tb->dims()[0];
auto shape_0 = phi::make_ddim({4, 2});
phi::DenseTensor m_diff;
phi::DenseTensor m_aver;
std::vector<T> vec_diff = {static_cast<T>(-1),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(-1),
static_cast<T>(1),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(1)};
std::vector<T> vec_aver = {static_cast<T>(0.5),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(0.5),
static_cast<T>(0.5),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(0.5)};
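// Viewed as 4x2 matrices: multiplying (xmin, ymin, xmax, ymax) by m_diff
// yields (width, height); multiplying by m_aver yields the box center.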
Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);
BoxCoderFunction<T> F(ctx);
phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
phi::DenseTensor tb_xy = F.Dot(*tb, m_aver);
phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1));
pb_xy.Resize({1, M, 2});
pb_wh.Resize({1, M, 2});
tb_xy.Resize({N, 1, 2});
tb_wh.Resize({N, 1, 2});
auto shape_half = phi::make_ddim({N, M, 2});
auto shape_full = phi::make_ddim({N, M, 4});
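// Center-size encoding: out_xy = (tb_xy - pb_xy) / pb_wh,
// out_wh = log(|tb_wh / pb_wh|).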
phi::DenseTensor out_xy_0 = F.DivWithBroadCast(
F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half);
phi::DenseTensor out_wh_0 =
F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half)));
phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2);
if (pbv) {
F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out);
} else {
phi::DenseTensor t_var;
std::vector<T> vec_var(4);
for (auto i = 0; i < 4; i++) {
vec_var[i] = static_cast<T>(variance[i]);
}
Vector2Tensor(ctx, vec_var, phi::make_ddim({1, 1, 4}), &t_var);
F.DivWithBroadCastVoid(out_0, t_var, shape_full, out);
}
}
template <typename T>
void BoxCoderDec(const framework::ExecutionContext& ctx,
const phi::DenseTensor* tb,
const phi::DenseTensor* pb,
const phi::DenseTensor* pbv,
const bool norm,
const std::vector<float>& variance,
int axis,
phi::DenseTensor* out) {
auto shape_0 = phi::make_ddim({4, 2});
phi::DenseTensor m_diff;
phi::DenseTensor m_aver;
std::vector<T> vec_diff = {static_cast<T>(-1),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(-1),
static_cast<T>(1),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(1)};
std::vector<T> vec_aver = {static_cast<T>(0.5),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(0.5),
static_cast<T>(0.5),
static_cast<T>(0),
static_cast<T>(0),
static_cast<T>(0.5)};
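// As in the encoder: m_diff maps corner boxes to (width, height),
// m_aver maps them to box centers.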
Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);
BoxCoderFunction<T> F(ctx);
phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2})
: phi::make_ddim({pb->dims()[0], 1, 2});
pb_xy.Resize(pb_resize_shape);
pb_wh.Resize(pb_resize_shape);
auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2});
std::vector<int> tbox_slice_size = {
static_cast<int>(tb->dims()[0]), static_cast<int>(tb->dims()[1]), 2};
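// Split the encoded target boxes into their (tx, ty) and (tw, th) halves.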
phi::DenseTensor tbox01 =
F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape);
phi::DenseTensor tbox23 =
F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape);
phi::DenseTensor tb_xy;
phi::DenseTensor tb_wh;
if (pbv) {
auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2});
auto pbvt_resize_shape = axis == 0 ? phi::make_ddim({1, pbv->dims()[0], 2})
: phi::make_ddim({pbv->dims()[0], 1, 2});
std::vector<int> pbvt_slice_size = {static_cast<int>(pbv->dims()[0]), 2};
phi::DenseTensor pbv_t01 =
F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape);
phi::DenseTensor pbv_t23 =
F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape);
pbv_t01.Resize(pbvt_resize_shape);
pbv_t23.Resize(pbvt_resize_shape);
F.AddWithBroadCastVoid(
F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape),
pb_xy,
tbox_slice_shape,
&tb_xy);
F.MulWithBroadCastVoid(
F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)),
pb_wh,
tbox_slice_shape,
&tb_wh);
} else if (variance.empty()) {
F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape),
pb_xy,
tbox_slice_shape,
&tb_xy);
F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh);
} else {
phi::DenseTensor t_var01, t_var23;
auto t_var_shape = phi::make_ddim({1, 1, 2});
std::vector<T> vec_var01 = {static_cast<T>(variance[0]),
static_cast<T>(variance[1])};
std::vector<T> vec_var23 = {static_cast<T>(variance[2]),
static_cast<T>(variance[3])};
Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01);
Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23);
F.AddWithBroadCastVoid(
F.MulWithBroadCast(tbox01,
F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape),
tbox_slice_shape),
pb_xy,
tbox_slice_shape,
&tb_xy);
F.MulWithBroadCastVoid(
F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)),
pb_wh,
tbox_slice_shape,
&tb_wh);
}
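// Decode back to corner coordinates: (cx, cy) ± (w, h) / 2, with a -1 offset
// on the max corner when boxes are not normalized.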
phi::DenseTensor obox01 =
F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape);
phi::DenseTensor obox23 =
F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape),
(norm ? 0 : -1));
F.ConcatVoid({obox01, obox23}, out->dims(), 2, out);
}
template <typename T>
class BoxCoderNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* prior_box = ctx.Input<phi::DenseTensor>("PriorBox");
auto* prior_box_var = ctx.Input<phi::DenseTensor>("PriorBoxVar");
auto* target_box = ctx.Input<phi::DenseTensor>("TargetBox");
auto* output_box = ctx.Output<phi::DenseTensor>("OutputBox");
std::vector<float> variance = ctx.Attr<std::vector<float>>("variance");
const int axis = ctx.Attr<int>("axis");
if (prior_box_var) {
PADDLE_ENFORCE_EQ(variance.empty(),
true,
platform::errors::InvalidArgument(
"Input 'PriorBoxVar' and attribute 'variance'"
" of BoxCoder operator should not be used at the "
"same time."));
}
if (!(variance.empty())) {
PADDLE_ENFORCE_EQ(static_cast<int>(variance.size()),
4,
platform::errors::InvalidArgument(
"Size of attribute 'variance' in BoxCoder operator"
" should be 4. But received size is %d",
variance.size()));
}
if (target_box->lod().size()) {
PADDLE_ENFORCE_EQ(target_box->lod().size(),
1,
platform::errors::InvalidArgument(
"Input 'TargetBox' of BoxCoder operator only"
" supports LoD with one level."));
}
auto code_type =
phi::funcs::GetBoxCodeType(ctx.Attr<std::string>("code_type"));
bool normalized = ctx.Attr<bool>("box_normalized");
if (code_type == phi::funcs::BoxCodeType::kEncodeCenterSize) {
BoxCoderEnc<T>(ctx,
target_box,
prior_box,
prior_box_var,
normalized,
variance,
output_box);
} else {
BoxCoderDec<T>(ctx,
target_box,
prior_box,
prior_box_var,
normalized,
variance,
axis,
output_box);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(box_coder,
ops::BoxCoderNPUKernel<float>,
ops::BoxCoderNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/density_prior_box_op.h"
namespace paddle {
namespace operators {
using fp16 = paddle::platform::float16;
template <typename T>
struct DensityPriorBoxFunction {
public:
explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx)
: ctx(ctx) {
place = ctx.GetPlace();
stream = ctx.template device_context<platform::NPUDeviceContext>().stream();
t0.mutable_data<float>({1}, place);
t1.mutable_data<float>({1}, place);
tn.mutable_data<float>({1}, place);
FillNpuTensorWithConstant<float>(&t0, static_cast<float>(0));
FillNpuTensorWithConstant<float>(&t1, static_cast<float>(1));
}
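// t0 and t1 hold the constant start and step for the Range op;
// tn carries the per-call limit set in Arange().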
void Arange(int n, phi::DenseTensor* x) {
// x should be init first
FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {});
runner.Run(stream);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) {
auto dst_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(y->type()));
const auto& runner = NpuOpRunner(
"Cast", {*x}, {*y}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner.Run(stream);
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
runner.Run(stream);
}
void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
runner.Run(stream);
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Concat(const std::vector<phi::DenseTensor>& inputs,
int axis,
phi::DenseTensor* output) {
// output should be init first
std::vector<std::string> names;
for (size_t i = 0; i < inputs.size(); i++) {
names.push_back("x" + std::to_string(i));
}
NpuOpRunner runner{
"ConcatD",
{inputs},
{*output},
{{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
runner.AddInputNames(names);
runner.Run(stream);
}
void Tile(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& multiples) {
// y should be init first
if (x->dims() == y->dims()) {
framework::TensorCopy(
*x,
place,
ctx.template device_context<platform::NPUDeviceContext>(),
y);
return;
}
const auto& runner =
NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}});
runner.Run(stream);
}
void FloatVec2Tsr(const std::vector<float>& vec, phi::DenseTensor* tsr_dst) {
// Copy host data to the device and wait for the transfer to complete.
framework::TensorFromVector<T>(vec, ctx.device_context(), tsr_dst);
ctx.template device_context<platform::NPUDeviceContext>().Wait();
}
private:
platform::Place place;
aclrtStream stream;
const framework::ExecutionContext& ctx;
phi::DenseTensor t0;
phi::DenseTensor t1;
phi::DenseTensor tn;
};
template <>
void DensityPriorBoxFunction<fp16>::Arange(int n, phi::DenseTensor* x) {
phi::DenseTensor x_fp32(phi::DataType::FLOAT32);
x_fp32.mutable_data<float>(x->dims(), place);
FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {});
runner.Run(stream);
Cast(&x_fp32, x);
}
template <>
void DensityPriorBoxFunction<fp16>::FloatVec2Tsr(const std::vector<float>& vec,
phi::DenseTensor* tsr_dst) {
phi::DenseTensor tsr_fp32(phi::DataType::FLOAT32);
tsr_fp32.mutable_data<float>(tsr_dst->dims(), place);
framework::TensorFromVector<float>(vec, ctx.device_context(), &tsr_fp32);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
Cast(&tsr_fp32, tsr_dst);
}
template <typename T>
class DensityPriorBoxOpNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* vars = ctx.Output<phi::DenseTensor>("Variances");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto clip = ctx.Attr<bool>("clip");
auto fixed_sizes = ctx.Attr<std::vector<float>>("fixed_sizes");
auto fixed_ratios = ctx.Attr<std::vector<float>>("fixed_ratios");
auto densities = ctx.Attr<std::vector<int>>("densities");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
int image_w = image->dims()[3];
int image_h = image->dims()[2];
int layer_w = input->dims()[3];
int layer_h = input->dims()[2];
auto _type = input->dtype();
auto place = ctx.GetPlace();
DensityPriorBoxFunction<T> F(ctx);
phi::DenseTensor h(_type);
h.mutable_data<T>({layer_h}, place);
phi::DenseTensor w(_type);
w.mutable_data<T>({layer_w}, place);
F.Arange(layer_h, &h);
F.Arange(layer_w, &w);
h.Resize({layer_h, 1, 1, 1});
w.Resize({1, layer_w, 1, 1});
step_w = step_w > 0 ? step_w : static_cast<float>(image_w) / layer_w;
step_h = step_h > 0 ? step_h : static_cast<float>(image_h) / layer_h;
int step_average = static_cast<int>((step_w + step_h) * 0.5);
int ratios_size = fixed_ratios.size();
int num_priors_per_ratio = 0;
for (size_t i = 0; i < densities.size(); ++i) {
num_priors_per_ratio += densities[i] * densities[i];
}
phi::DenseTensor di(_type);
phi::DenseTensor dj(_type);
phi::DenseTensor shifts(_type);
phi::DenseTensor box_w_ratio(_type);
phi::DenseTensor box_h_ratio(_type);
di.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
dj.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
shifts.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
box_w_ratio.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
box_h_ratio.mutable_data<T>({ratios_size * num_priors_per_ratio}, place);
int64_t start = 0;
std::vector<int> vec_tile = {0, 0, 0};
for (size_t i = 0; i < densities.size(); ++i) {
// Range = start:start+ratios_size*density_sqr, density = densities[i]
int density_sqr = densities[i] * densities[i];
// shifts[Range] = [step_average/density]*ratios_size*density_sqr
phi::DenseTensor shifts_part =
shifts.Slice(start, start + ratios_size * density_sqr);
FillNpuTensorWithConstant<T>(&shifts_part,
static_cast<T>(step_average / densities[i]));
// di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size
// dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size
phi::DenseTensor di_part =
di.Slice(start, start + ratios_size * density_sqr);
phi::DenseTensor dj_part =
dj.Slice(start, start + ratios_size * density_sqr);
if (densities[i] > 1) {
di_part.Resize({ratios_size, densities[i], densities[i]});
dj_part.Resize({ratios_size, densities[i], densities[i]});
phi::DenseTensor range_n(_type);
range_n.mutable_data<T>({densities[i]}, place);
F.Arange(densities[i], &range_n);
range_n.Resize({1, densities[i], 1});
vec_tile[0] = ratios_size;
vec_tile[1] = 1;
vec_tile[2] = densities[i];
F.Tile(&range_n, &di_part, vec_tile);
range_n.Resize({1, 1, densities[i]});
vec_tile[1] = densities[i];
vec_tile[2] = 1;
F.Tile(&range_n, &dj_part, vec_tile);
} else {
FillNpuTensorWithConstant<T>(&di_part, static_cast<T>(0));
FillNpuTensorWithConstant<T>(&dj_part, static_cast<T>(0));
}
int start_box_ratio = start;
for (float ar : fixed_ratios) {
// Range_mini = start_box_ratio:start_box_ratio+density_sqr
// box_w_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr
// box_h_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr
phi::DenseTensor box_h_ratio_part =
box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
phi::DenseTensor box_w_ratio_part =
box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr);
FillNpuTensorWithConstant<T>(&box_w_ratio_part,
static_cast<T>(fixed_sizes[i] * sqrt(ar)));
FillNpuTensorWithConstant<T>(&box_h_ratio_part,
static_cast<T>(fixed_sizes[i] / sqrt(ar)));
start_box_ratio += density_sqr;
}
start = start_box_ratio;
}
di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1});
// c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts
// c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts
phi::DenseTensor c_x(_type);
phi::DenseTensor c_y(_type);
auto dim0 =
phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1});
auto dim1 =
phi::make_ddim({layer_h, 1, ratios_size * num_priors_per_ratio, 1});
c_x.mutable_data<T>(dim0, place);
c_y.mutable_data<T>(dim1, place);
F.Adds(&w, offset, &w);
F.Muls(&w, step_w, &w);
F.Adds(&w, static_cast<float>(-step_average) * static_cast<float>(0.5), &w);
F.Adds(&h, offset, &h);
F.Muls(&h, step_h, &h);
F.Adds(&h, static_cast<float>(-step_average) * static_cast<float>(0.5), &h);
F.Mul(&di, &shifts, &di);
F.Mul(&dj, &shifts, &dj);
F.Muls(&shifts, static_cast<float>(0.5), &shifts);
F.Add(&di, &shifts, &di);
F.Add(&dj, &shifts, &dj);
F.Add(&dj, &w, &c_x);
F.Add(&di, &h, &c_y);
// box_w_ratio = box_w_ratio / 2
// box_h_ratio = box_h_ratio / 2
F.Muls(&box_w_ratio, static_cast<float>(0.5), &box_w_ratio);
F.Muls(&box_h_ratio, static_cast<float>(0.5), &box_h_ratio);
phi::DenseTensor zero_t(_type);
phi::DenseTensor one_t(_type);
zero_t.mutable_data<T>({1}, place);
one_t.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&zero_t, static_cast<T>(0));
FillNpuTensorWithConstant<T>(&one_t, static_cast<T>(1));
phi::DenseTensor outbox0(_type);
phi::DenseTensor outbox1(_type);
phi::DenseTensor outbox2(_type);
phi::DenseTensor outbox3(_type);
outbox0.mutable_data<T>(dim0, place);
outbox1.mutable_data<T>(dim1, place);
outbox2.mutable_data<T>(dim0, place);
outbox3.mutable_data<T>(dim1, place);
// outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 )
// outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 )
// outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 )
// outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 )
F.Sub(&c_x, &box_w_ratio, &outbox0);
F.Sub(&c_y, &box_h_ratio, &outbox1);
F.Add(&c_x, &box_w_ratio, &outbox2);
F.Add(&c_y, &box_h_ratio, &outbox3);
F.Muls(&outbox0, static_cast<float>(1.0 / image_w), &outbox0);
F.Muls(&outbox1, static_cast<float>(1.0 / image_h), &outbox1);
F.Muls(&outbox2, static_cast<float>(1.0 / image_w), &outbox2);
F.Muls(&outbox3, static_cast<float>(1.0 / image_h), &outbox3);
F.Maximum(&outbox0, &zero_t, &outbox0);
F.Maximum(&outbox1, &zero_t, &outbox1);
F.Minimum(&outbox2, &one_t, &outbox2);
F.Minimum(&outbox3, &one_t, &outbox3);
if (clip) {
// outbox0 = min ( outbox0, 1 )
// outbox1 = min ( outbox1, 1 )
// outbox2 = max ( outbox2, 0 )
// outbox3 = max ( outbox3, 0 )
F.Minimum(&outbox0, &one_t, &outbox0);
F.Minimum(&outbox1, &one_t, &outbox1);
F.Maximum(&outbox2, &zero_t, &outbox2);
F.Maximum(&outbox3, &zero_t, &outbox3);
}
auto out_dim = phi::make_ddim(
{layer_h, layer_w, ratios_size * num_priors_per_ratio, 4});
boxes->mutable_data<T>(place);
vars->mutable_data<T>(place);
phi::DenseTensor boxes_share(_type);
phi::DenseTensor vars_share(_type);
boxes_share.ShareDataWith(*boxes);
boxes_share.Resize(out_dim);
vars_share.ShareDataWith(*vars);
vars_share.Resize(out_dim);
phi::DenseTensor box0(_type);
phi::DenseTensor box1(_type);
phi::DenseTensor box2(_type);
phi::DenseTensor box3(_type);
// out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1}
out_dim[3] = 1;
box0.mutable_data<T>(out_dim, place);
box1.mutable_data<T>(out_dim, place);
box2.mutable_data<T>(out_dim, place);
box3.mutable_data<T>(out_dim, place);
std::vector<int> vec_exp_out02 = {layer_h, 1, 1, 1};
std::vector<int> vec_exp_out13 = {1, layer_w, 1, 1};
F.Tile(&outbox0, &box0, vec_exp_out02);
F.Tile(&outbox1, &box1, vec_exp_out13);
F.Tile(&outbox2, &box2, vec_exp_out02);
F.Tile(&outbox3, &box3, vec_exp_out13);
F.Concat({box0, box1, box2, box3}, 3, &boxes_share);
std::vector<int> multiples = {
layer_h, layer_w, ratios_size * num_priors_per_ratio, 1};
phi::DenseTensor variances_t(_type);
// variances.size() == 4
variances_t.mutable_data<T>({4}, place);
F.FloatVec2Tsr(variances, &variances_t);
F.Tile(&variances_t, &vars_share, multiples);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(density_prior_box,
ops::DensityPriorBoxOpNPUKernel<plat::float16>,
ops::DensityPriorBoxOpNPUKernel<float>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
namespace paddle {
namespace operators {
template <typename T>
struct IouFunction {
public:
explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
}
void Transpose(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& axis) {
// y should be init first
const auto& runner =
NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}});
runner.Run(stream);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void DivNoNan(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
runner.Run(stream);
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
runner.Run(stream);
}
private:
platform::Place place;
aclrtStream stream;
const framework::ExecutionContext& ctx;
};
template <typename T>
class IouSimilarityNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
bool normalized = ctx.Attr<bool>("box_normalized");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto _type = x->dtype();
auto place = ctx.GetPlace();
IouFunction<T> F(ctx);
auto N = x->dims()[0];
auto M = y->dims()[0];
out->mutable_data<T>({N, M}, place);
phi::DenseTensor xt(_type);
phi::DenseTensor yt(_type);
xt.mutable_data<T>({4, N}, place);
yt.mutable_data<T>({4, M}, place);
std::vector<int> vec_trans = {1, 0};
F.Transpose(x, &xt, vec_trans);
F.Transpose(y, &yt, vec_trans);
phi::DenseTensor xmin1 = xt.Slice(0, 1);
phi::DenseTensor ymin1 = xt.Slice(1, 2);
phi::DenseTensor xmax1 = xt.Slice(2, 3);
phi::DenseTensor ymax1 = xt.Slice(3, 4);
phi::DenseTensor xmin2 = yt.Slice(0, 1);
phi::DenseTensor ymin2 = yt.Slice(1, 2);
phi::DenseTensor xmax2 = yt.Slice(2, 3);
phi::DenseTensor ymax2 = yt.Slice(3, 4);
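// Rows 0..3 of the transposed tensors hold xmin, ymin, xmax and ymax.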
xmin1.Resize({N, 1});
ymin1.Resize({N, 1});
xmax1.Resize({N, 1});
ymax1.Resize({N, 1});
xmin2.Resize({1, M});
ymin2.Resize({1, M});
xmax2.Resize({1, M});
ymax2.Resize({1, M});
phi::DenseTensor w1(_type);
phi::DenseTensor h1(_type);
phi::DenseTensor w2(_type);
phi::DenseTensor h2(_type);
phi::DenseTensor area1(_type);
phi::DenseTensor area2(_type);
w1.mutable_data<T>({N, 1}, place);
h1.mutable_data<T>({N, 1}, place);
w2.mutable_data<T>({1, M}, place);
h2.mutable_data<T>({1, M}, place);
area1.mutable_data<T>({N, 1}, place);
area2.mutable_data<T>({1, M}, place);
F.Sub(&xmax1, &xmin1, &w1);
F.Sub(&ymax1, &ymin1, &h1);
F.Sub(&xmax2, &xmin2, &w2);
F.Sub(&ymax2, &ymin2, &h2);
if (!normalized) {
F.Adds(&w1, 1.0f, &w1);
F.Adds(&h1, 1.0f, &h1);
F.Adds(&w2, 1.0f, &w2);
F.Adds(&h2, 1.0f, &h2);
}
F.Mul(&w1, &h1, &area1);
F.Mul(&w2, &h2, &area2);
phi::DenseTensor inter_xmax(_type);
phi::DenseTensor inter_ymax(_type);
phi::DenseTensor inter_xmin(_type);
phi::DenseTensor inter_ymin(_type);
inter_xmax.mutable_data<T>({N, M}, place);
inter_ymax.mutable_data<T>({N, M}, place);
inter_xmin.mutable_data<T>({N, M}, place);
inter_ymin.mutable_data<T>({N, M}, place);
F.Minimum(&xmax1, &xmax2, &inter_xmax);
F.Minimum(&ymax1, &ymax2, &inter_ymax);
F.Maximum(&xmin1, &xmin2, &inter_xmin);
F.Maximum(&ymin1, &ymin2, &inter_ymin);
phi::DenseTensor inter_w(_type);
phi::DenseTensor inter_h(_type);
inter_w.mutable_data<T>({N, M}, place);
inter_h.mutable_data<T>({N, M}, place);
F.Sub(&inter_xmax, &inter_xmin, &inter_w);
F.Sub(&inter_ymax, &inter_ymin, &inter_h);
if (!normalized) {
F.Adds(&inter_w, 1.0f, &inter_w);
F.Adds(&inter_h, 1.0f, &inter_h);
}
phi::DenseTensor zeros(_type);
zeros.mutable_data<T>({1}, place);
FillNpuTensorWithConstant<T>(&zeros, static_cast<T>(0));
F.Maximum(&inter_w, &zeros, &inter_w);
F.Maximum(&inter_h, &zeros, &inter_h);
F.Mul(&inter_w, &inter_h, out);
phi::DenseTensor union_area(_type);
union_area.mutable_data<T>({N, M}, place);
F.Add(&area1, &area2, &union_area);
F.Sub(&union_area, out, &union_area);
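// DivNoNan returns 0 where union_area is 0, so degenerate boxes yield an
// IoU of 0 instead of NaN.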
F.DivNoNan(out, &union_area, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(iou_similarity,
ops::IouSimilarityNPUKernel<float>,
ops::IouSimilarityNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/prior_box_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class PriorBoxNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
PADDLE_ENFORCE_EQ(boxes->dims(),
variances->dims(),
platform::errors::Unimplemented(
"the shape of boxes and variances must be same in "
"the npu kernel of prior_box, but got boxes->dims() "
"= [%s], variances->dims() = [%s]",
boxes->dims(),
variances->dims()));
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
bool flip = ctx.Attr<bool>("flip");
bool clip = ctx.Attr<bool>("clip");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
auto place = ctx.GetPlace();
phi::DenseTensor out(input->type());
auto out_dims = phi::vectorize(boxes->dims());
out_dims.insert(out_dims.begin(), 2);
out.Resize(phi::make_ddim(out_dims));
out.mutable_data<T>(place);
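// The Ascend PriorBox op emits boxes and variances stacked into a single
// output, hence the extra leading dimension of 2; the two halves are
// sliced apart below.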
framework::NPUAttributeMap attr_input = {{"min_size", min_sizes},
{"max_size", max_sizes},
{"aspect_ratio", aspect_ratios},
{"step_h", step_h},
{"step_w", step_w},
{"flip", flip},
{"clip", clip},
{"offset", offset},
{"variance", variances_attr}};
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("PriorBox", {*input, *image}, {out}, attr_input);
runner.Run(stream);
out.Resize(phi::make_ddim({out.numel()}));
phi::DenseTensor out_boxes = out.Slice(0, boxes->numel());
phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel());
out_boxes.Resize(boxes->dims());
out_variances.Resize(variances->dims());
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
framework::TensorCopy(
out_boxes,
place,
ctx.template device_context<platform::NPUDeviceContext>(),
boxes);
framework::TensorCopy(
out_variances,
place,
ctx.template device_context<platform::NPUDeviceContext>(),
variances);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
prior_box,
ops::PriorBoxNPUKernel<plat::NPUDeviceContext, float>,
ops::PriorBoxNPUKernel<plat::NPUDeviceContext, plat::float16>);