Unverified commit 9d2e0ecb, authored by Thomas Young, committed by GitHub

[operator migration] Migrate unstack_op and nms_op (#44424)

* update unstack_op

* update unstack_op

* update unstack_op

* fix unstack test

* update unstack

* update with remote

* fix unstack_test.py

* temp_save_change_nms_op

* add nms test

* update nms fix

* update unstack_op

* temp save change

* finish fix nms_op

* pass nms test

* fix CI

* fix ops test

* save change

* fix code style

* fix code style

* fix ci and codestyle

* fix ci
Co-authored-by: ShiningZhang <zhang_liang1991@126.com>
Parent commit: 74e46a93
@@ -81,7 +81,7 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
sigmoid_focal_loss_op.cu)
detection_library(retinanet_detection_output_op SRCS
retinanet_detection_output_op.cc)
detection_library(nms_op SRCS nms_op.cc nms_op.cu)
detection_library(nms_op SRCS nms_op.cc)
if(WITH_GPU OR WITH_ROCM)
set(TMPDEPS memory)
......
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/nms_op.h"
#include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
namespace operators {
@@ -65,23 +69,6 @@ class NMSOpMaker : public framework::OpProtoAndCheckerMaker {
class NMSOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Boxes"), "Input", "Boxes", "NMS");
OP_INOUT_CHECK(
ctx->HasOutput("KeepBoxesIdxs"), "Output", "KeepBoxesIdxs", "NMS");
auto boxes_dim = ctx->GetInputDim("Boxes");
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
platform::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
ctx->SetOutputDim("KeepBoxesIdxs", {num_boxes});
}
protected:
framework::OpKernelType GetExpectedKernelType(
@@ -92,64 +79,20 @@ class NMSOp : public framework::OperatorWithKernel {
};
template <typename T>
class NMSKernel : public framework::OpKernel<T> {};
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T>
class NMSKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
NMS<T>(boxes->data<T>(), output_data, threshold, boxes->dims()[0]);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(nms,
NMSInferMetaFunctor,
PD_INFER_META(phi::NMSInferMeta));
REGISTER_OPERATOR(
nms,
ops::NMSOp,
ops::NMSOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
NMSInferMetaFunctor);
REGISTER_OP_CPU_KERNEL(nms, ops::NMSKernel<float>, ops::NMSKernel<double>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/detection/nms_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T>
class NMSCudaKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* boxes = context.Input<Tensor>("Boxes");
Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
auto* output_data = output->mutable_data<int64_t>(context.GetPlace());
auto threshold = context.template Attr<float>("iou_threshold");
const int64_t num_boxes = boxes->dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data =
memory::Alloc(context.cuda_device_context(),
num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, context.cuda_device_context().stream()>>>(
boxes->data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
memory::Copy(platform::CPUPlace(),
mask_host.data(),
context.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
context.cuda_device_context().stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
memory::Copy(context.GetPlace(),
output_data,
platform::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
context.cuda_device_context().stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(nms,
ops::NMSCudaKernel<float>,
ops::NMSCudaKernel<double>);
@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle {
@@ -63,51 +64,6 @@ class UnStackGradOpMaker : public framework::SingleGradOpMaker<T> {
class UnStackGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(),
0,
platform::errors::InvalidArgument(
"The Inputs(Y@Grad) of unstack operator are empty."));
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")),
"Output",
"X",
"UnStackGrad");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
platform::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int axis = ctx->Attrs().Get<int>("axis");
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
platform::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
ctx->SetOutputDim(framework::GradVarName("X"), phi::make_ddim(vec));
}
};
} // namespace operators
@@ -119,12 +75,15 @@ namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(unstack,
UnStackInferMetaFunctor,
PD_INFER_META(phi::UnStackInferMeta));
DECLARE_INFER_SHAPE_FUNCTOR(unstack_grad,
UnStackGradInferMetaFunctor,
PD_INFER_META(phi::UnStackGradInferMeta));
REGISTER_OPERATOR(unstack,
ops::UnStackOp,
ops::UnStackOpMaker,
ops::UnStackGradOpMaker<paddle::framework::OpDesc>,
ops::UnStackGradOpMaker<paddle::imperative::OpBase>,
UnStackInferMetaFunctor);
REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp);
REGISTER_OPERATOR(unstack_grad,
ops::UnStackGradOp,
UnStackGradInferMetaFunctor);
@@ -889,7 +889,7 @@
func : FrameInferMeta
kernel :
func : frame
backward : frame_grad
- api : frobenius_norm
args : (Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all)
@@ -1700,6 +1700,15 @@
optional : weight
backward : nll_loss_grad
- api : nms
args : (Tensor x, float threshold)
output : Tensor(out)
infer_meta :
func : NMSInferMeta
kernel :
func : nms
data_type : x
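The entry above wires the new final-state nms API to NMSInferMeta and the phi nms kernel. As a quick orientation (not part of this diff), a minimal eager-mode call through the public Python wrapper might look like the sketch below; it assumes the paddle.vision.ops.nms signature exercised by the updated unit test further down, with boxes of shape [N, 4].

import numpy as np
import paddle

# Hypothetical usage sketch: run NMS on a handful of random boxes.
boxes = np.random.rand(8, 4).astype('float32')
boxes[:, 2] = boxes[:, 0] + boxes[:, 2]  # turn widths into x2
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]  # turn heights into y2
kept = paddle.vision.ops.nms(paddle.to_tensor(boxes), iou_threshold=0.5)
print(kept)  # int64 indices of the boxes that survive suppression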
- api : norm
args : (Tensor x, int axis, float epsilon, bool is_test)
output : Tensor(out), Tensor(norm)
@@ -2258,7 +2267,7 @@
kernel :
func : spectralnorm
data_type : weight
backward : spectral_norm_grad
- api : split
args : (Tensor x, IntArray num_or_sections, Scalar(int) axis)
@@ -2566,6 +2575,16 @@
intermediate : xshape
backward : unsqueeze_grad
# unstack
- api : unstack
args : (Tensor x, int axis, int num)
output : Tensor[]{num}
infer_meta :
func : UnStackInferMeta
kernel :
func : unstack
backward : unstack_grad
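Likewise, the unstack entry above maps the Python API onto UnStackInferMeta and the phi unstack kernel. A minimal usage sketch (mirroring the docstring already quoted later in this diff) is:

import paddle

# Hypothetical sketch: split a [2, 3, 5] tensor along axis 1.
x = paddle.ones([2, 3, 5])
ys = paddle.unstack(x, axis=1)   # list of 3 tensors, each with shape [2, 5]
print(len(ys), ys[0].shape)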
# viterbi_decode
- api : viterbi_decode
args : (Tensor input, Tensor transition, Tensor length, bool include_bos_eos_tag)
@@ -2629,7 +2648,7 @@
kernel:
func: broadcast_tensors
backward: broadcast_tensors_grad
# dirichlet
- api: dirichlet
args: (Tensor alpha)
......
@@ -2499,6 +2499,16 @@
inplace : (out_grad -> x_grad)
backward : unsqueeze_double_grad
- backward_api : unstack_grad
forward : unstack (Tensor x, int axis, int num) -> Tensor[](out)
args : (Tensor[] out_grad, int axis)
output : Tensor(x_grad)
infer_meta :
func : UnStackGradInferMeta
param : [out_grad, axis]
kernel :
func : unstack_grad
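The backward entry above feeds all output gradients plus the axis into UnStackGradInferMeta (added later in this commit). The shape rule it encodes is simply that the gradient of unstack is the stack of the output gradients along the same axis; a NumPy sketch of that assumption:

import numpy as np

# Illustrative only: three [2, 5] output gradients stacked back along axis 1
# give the [2, 3, 5] input gradient, matching the infer-meta shape insertion.
out_grads = [np.ones((2, 5)) for _ in range(3)]
x_grad = np.stack(out_grads, axis=1)
print(x_grad.shape)  # (2, 3, 5)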
- backward_api : warpctc_grad
forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad)
args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times)
......
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
@@ -787,4 +786,46 @@ void StackGradInferMeta(const MetaTensor& out_grad,
}
}
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad) {
std::vector<phi::DDim> input_dims(out_grad.size());
for (size_t i = 0; i < out_grad.size(); ++i) {
input_dims[i] = out_grad[i]->dims();
}
for (size_t i = 1; i < input_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(
input_dims[i],
input_dims[0],
phi::errors::InvalidArgument(
"The dimensions of all Inputs(Y@Grad) must be the same,"
"but received Inputs(Y@Grad)'s %d-th dimension is %d, "
"Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.",
i,
input_dims[i],
i - 1,
input_dims[0]));
}
int rank = input_dims[0].size();
PADDLE_ENFORCE_GE(axis,
-(rank + 1),
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
PADDLE_ENFORCE_LT(axis,
rank + 1,
phi::errors::InvalidArgument(
"The attribute axis is out of range, it must be "
"inside [-(rank+1), rank+1), where rank = %d",
rank));
if (axis < 0) axis += (rank + 1);
auto vec = phi::vectorize<int>(input_dims[0]);
vec.insert(vec.begin() + axis, input_dims.size());
x_grad->set_dims(phi::make_ddim(vec));
x_grad->set_dtype(out_grad[0]->dtype());
}
} // namespace phi
@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include <tuple>
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/binary.h"
#include "paddle/phi/infermeta/multiary.h"
@@ -325,4 +324,8 @@ void StackGradInferMeta(const MetaTensor& out_grad,
int axis,
std::vector<MetaTensor*> x_grad);
void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
int axis,
MetaTensor* x_grad);
} // namespace phi
@@ -1707,6 +1707,19 @@ void NanmedianInferMeta(const MetaTensor& x,
out->set_dims(make_ddim(out_dim));
}
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) {
auto boxes_dim = x.dims();
PADDLE_ENFORCE_EQ(boxes_dim.size(),
2,
phi::errors::InvalidArgument(
"The Input Boxes must be 2-dimention "
"whose shape must be [N, 4] "
"N is the number of boxes "
"in last dimension in format [x1, x2, y1, y2]. "));
auto num_boxes = boxes_dim[0];
out->set_dims(phi::make_ddim({num_boxes}));
}
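NMSInferMeta above fixes the output shape independently of how many boxes actually survive: for Boxes of shape [N, 4], KeepBoxesIdxs is always a length-N index tensor (the CPU kernel below pads the unused tail with zeros). A tiny illustrative check of that contract, with no Paddle call involved:

import numpy as np

# Illustration of the static shape rule only.
boxes = np.random.rand(32, 4)
assert boxes.ndim == 2 and boxes.shape[1] == 4
keep_idxs_shape = (boxes.shape[0],)   # what out->set_dims({num_boxes}) records
print(keep_idxs_shape)                # (32,)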
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
@@ -228,6 +228,8 @@ void NanmedianInferMeta(const MetaTensor& x,
MetaTensor* out,
MetaTensor* median_index);
void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out);
void NormInferMeta(const MetaTensor& x,
int axis,
float epsilon,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/diagonal.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace phi {
template <typename T>
static void NMS(const T* boxes_data,
int64_t* output_data,
float threshold,
int64_t num_boxes) {
auto num_masks = CeilDivide(num_boxes, 64);
std::vector<uint64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
T box_1[4];
for (int k = 0; k < 4; ++k) {
box_1[k] = boxes_data[i * 4 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64)) continue;
T box_2[4];
for (int k = 0; k < 4; ++k) {
box_2[k] = boxes_data[j * 4 + k];
}
bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
if (is_overlap) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64)) continue;
output_data[output_data_idx++] = i;
}
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto output_data = dev_ctx.template Alloc<int64_t>(output);
NMS<T>(boxes.data<T>(), output_data, threshold, boxes.dims()[0]);
}
} // namespace phi
PD_REGISTER_KERNEL(nms, CPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
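For readers new to this kernel, the CPU path above is the classic greedy hard-NMS: walk boxes in order, keep a box if it has not been suppressed, and mark every later box whose IoU with it exceeds the threshold, using packed 64-bit masks as the suppressed set. Below is a NumPy sketch of the same logic, with boolean flags instead of bit masks and returning only the kept prefix rather than zero-padding to N; a [x1, y1, x2, y2] box layout is assumed.

import numpy as np

def nms_reference(boxes, threshold):
    # boxes: (N, 4) float array; returns indices of kept boxes in order.
    n = len(boxes)
    suppressed = np.zeros(n, dtype=bool)   # stands in for the 64-bit masks
    keep = []
    for i in range(n):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in range(i + 1, n):
            if suppressed[j]:
                continue
            # IoU(i, j) > threshold, the test CalculateIoU performs
            x1 = max(boxes[i, 0], boxes[j, 0])
            y1 = max(boxes[i, 1], boxes[j, 1])
            x2 = min(boxes[i, 2], boxes[j, 2])
            y2 = min(boxes[i, 3], boxes[j, 3])
            inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
            area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
            area_j = (boxes[j, 2] - boxes[j, 0]) * (boxes[j, 3] - boxes[j, 1])
            if inter / (area_i + area_j - inter) > threshold:
                suppressed[j] = True
    return keep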
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
namespace phi {
template <typename T>
static __global__ void NMS(const T* boxes_data,
float threshold,
int64_t num_boxes,
uint64_t* masks) {
auto raw_start = blockIdx.y;
auto col_start = blockIdx.x;
if (raw_start > col_start) return;
const int raw_last_storage =
min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
const int col_last_storage =
min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
if (threadIdx.x < raw_last_storage) {
uint64_t mask = 0;
auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
const T* current_box = boxes_data + current_box_idx * 4;
for (int i = 0; i < col_last_storage; ++i) {
const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
if (CalculateIoU<T>(current_box, target_box, threshold)) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output) {
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
const int64_t num_boxes = boxes.dims()[0];
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data = paddle::memory::Alloc(
dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t));
uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
output_host[last_box_num++] = i;
uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * num_boxes,
dev_ctx.stream());
}
} // namespace phi
PD_REGISTER_KERNEL(nms, GPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
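The GPU path splits the same computation in two: the CUDA kernel fills, for every box, one 64-bit mask per 64-box column block recording which boxes it overlaps, and the host loop then scans boxes in order, keeping a box only if no earlier kept box suppressed it and folding its masks into a running removed set. A NumPy sketch of that host-side reduction, with a dense boolean matrix standing in for the packed masks (the matrix is what the kernel computes in parallel):

import numpy as np

def reduce_overlap_matrix(overlap):
    # overlap[i, j] is True when IoU(box i, box j) exceeds the threshold;
    # only entries with j > i affect the result, mirroring the block masks.
    n = overlap.shape[0]
    removed = np.zeros(n, dtype=bool)
    keep = []
    for i in range(n):
        if removed[i]:
            continue
        keep.append(i)                 # output_host[last_box_num++] = i
        removed |= overlap[i]          # remv[j] |= current_mask[j]
    return keep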
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
namespace paddle {
namespace operators {
namespace phi {
HOSTDEVICE static inline int64_t CeilDivide(int64_t n, int64_t m) {
return (n + m - 1) / m;
@@ -48,5 +47,10 @@ HOSTDEVICE inline bool CalculateIoU(const T* const box_1,
return inter_area / union_area > threshold;
}
} // namespace operators
} // namespace paddle
template <typename T, typename Context>
void NMSKernel(const Context& dev_ctx,
const DenseTensor& boxes,
float threshold,
DenseTensor* output);
} // namespace phi
@@ -10685,6 +10685,7 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
@@ -15,6 +15,7 @@
import unittest
import numpy as np
from op_test import OpTest
import paddle
def iou(box_a, box_b):
@@ -71,22 +72,25 @@ class TestNMSOp(OpTest):
def setUp(self):
self.op_type = 'nms'
self.python_api = paddle.vision.ops.nms
self.dtype = np.float64
self.init_dtype_type()
boxes = np.random.rand(32, 4).astype(self.dtype)
boxes[:, 2] = boxes[:, 0] + boxes[:, 2]
boxes[:, 3] = boxes[:, 1] + boxes[:, 3]
paddle.disable_static()
self.inputs = {'Boxes': boxes}
self.attrs = {'iou_threshold': 0.5}
out_py = nms(boxes, self.attrs['iou_threshold'])
self.outputs = {'KeepBoxesIdxs': out_py}
paddle.enable_static()
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
if __name__ == "__main__":
......
@@ -15,6 +15,7 @@
from op_test import OpTest
import numpy as np
import unittest
import paddle
class TestUnStackOpBase(OpTest):
@@ -37,6 +38,7 @@ class TestUnStackOpBase(OpTest):
self.initDefaultParameters()
self.initParameters()
self.op_type = 'unstack'
self.python_api = paddle.unstack
self.x = np.random.random(size=self.input_dim).astype(self.dtype)
outs = np.split(self.x, self.input_dim[self.axis], self.axis)
@@ -44,18 +46,21 @@ class TestUnStackOpBase(OpTest):
del new_shape[self.axis]
y_names = self.get_y_names()
tmp = []
tmp_names = []
for i in range(self.input_dim[self.axis]):
tmp.append((y_names[i], np.reshape(outs[i], new_shape)))
tmp_names.append(y_names[i])
self.python_out_sig = tmp_names
self.inputs = {'X': self.x}
self.outputs = {'Y': tmp}
self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]}
def test_check_output(self):
self.check_output()
self.check_output(check_eager=True)
def test_check_grad(self):
self.check_grad(['X'], self.get_y_names())
self.check_grad(['X'], self.get_y_names(), check_eager=True)
class TestStackOp3(TestUnStackOpBase):
......
@@ -454,6 +454,13 @@ def unstack(x, axis=0, num=None):
y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5]
"""
if in_dygraph_mode():
if num == None:
num = x.shape[axis]
if num == 0:
return []
return _C_ops.final_state_unstack(x, axis, num)
if _non_static_mode():
if num == None:
num = x.shape[axis]
......
@@ -1579,6 +1579,9 @@ def nms(boxes,
"""
def _nms(boxes, iou_threshold):
if in_dygraph_mode():
return _C_ops.final_state_nms(boxes, iou_threshold)
if _non_static_mode():
return _C_ops.nms(boxes, 'iou_threshold', iou_threshold)
......