transfer op multiclass_nms3 to phi (#44765)

* add cmake enforce * transfer multiclass_nms3 to phi

transfer op multiclass_nms3 to phi (#44765)
* add cmake enforce * transfer multiclass_nms3 to phi
15ce2c1b · zhiboniu · GitHub · 02414aac · 15ce2c1b · 15ce2c1b
16 changed files
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -123,9 +123,5 @@ cc_test(
  mask_util_test
  SRCS mask_util_test.cc
  DEPS memory mask_util)
-cc_library(
-  gpc
-  SRCS gpc.cc
-  DEPS op_registry)
 detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS
                  mask_util)
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -13,8 +13,10 @@ limitations under the License. */
 #include <glog/logging.h>
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/nms_util.h"
+#include "paddle/phi/infermeta/ternary.h"
 namespace paddle {
 namespace operators {
@@ -609,12 +611,6 @@ class MultiClassNMS3Op : public MultiClassNMS2Op {
                   const framework::VariableNameMap& outputs,
                   const framework::AttributeMap& attrs)
      : MultiClassNMS2Op(type, inputs, outputs, attrs) {}
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    MultiClassNMS2Op::InferShape(ctx);
-    ctx->SetOutputDim("NmsRoisNum", {-1});
-  }
 };
 class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
@@ -633,6 +629,10 @@ class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
 }  // namespace operators
 }  // namespace paddle
+DECLARE_INFER_SHAPE_FUNCTOR(multiclass_nms3,
+                            MultiClassNMSShapeFunctor,
+                            PD_INFER_META(phi::MultiClassNMSInferMeta));
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
    multiclass_nms,
@@ -658,7 +658,5 @@ REGISTER_OPERATOR(
    ops::MultiClassNMS3Op,
    ops::MultiClassNMS3OpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-REGISTER_OP_CPU_KERNEL(multiclass_nms3,
+    MultiClassNMSShapeFunctor);
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
--- a/paddle/fluid/operators/detection/poly_util.cc
+++ b/paddle/fluid/operators/detection/poly_util.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using gpc::gpc_free_polygon;
+using phi::funcs::gpc_free_polygon;
-using gpc::gpc_polygon_clip;
+using phi::funcs::gpc_polygon_clip;
 template <class T>
 void Array2PointVec(const T* box,
@@ -37,15 +37,18 @@ void Array2PointVec(const T* box,
 }
 template <class T>
-void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
+void Array2Poly(const T* box,
+                const size_t box_size,
+                phi::funcs::gpc_polygon* poly) {
  size_t pts_num = box_size / 2;
  (*poly).num_contours = 1;
  (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
  (*poly).hole[0] = 0;
-  (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
+  (*poly).contour =
+      (phi::funcs::gpc_vertex_list*)malloc(sizeof(phi::funcs::gpc_vertex_list));
  (*poly).contour->num_vertices = pts_num;
  (*poly).contour->vertex =
-      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
+      (phi::funcs::gpc_vertex*)malloc(sizeof(phi::funcs::gpc_vertex) * pts_num);
  for (size_t i = 0; i < pts_num; ++i) {
    (*poly).contour->vertex[i].x = box[2 * i];
    (*poly).contour->vertex[i].y = box[2 * i + 1];
@@ -53,15 +56,17 @@ void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) {
 }
 template <class T>
-void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly) {
+void PointVec2Poly(const std::vector<Point_<T>>& vec,
+                   phi::funcs::gpc_polygon* poly) {
  int pts_num = vec.size();
  (*poly).num_contours = 1;
  (*poly).hole = reinterpret_cast<int*>(malloc(sizeof(int)));
  (*poly).hole[0] = 0;
-  (*poly).contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list));
+  (*poly).contour =
+      (phi::funcs::gpc_vertex_list*)malloc(sizeof(phi::funcs::gpc_vertex_list));
  (*poly).contour->num_vertices = pts_num;
  (*poly).contour->vertex =
-      (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num);
+      (phi::funcs::gpc_vertex*)malloc(sizeof(phi::funcs::gpc_vertex) * pts_num);
  for (size_t i = 0; i < pts_num; ++i) {
    (*poly).contour->vertex[i].x = vec[i].x;
    (*poly).contour->vertex[i].y = vec[i].y;
@@ -69,7 +74,7 @@ void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly) {
 }
 template <class T>
-void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+void Poly2PointVec(const phi::funcs::gpc_vertex_list& contour,
                   std::vector<Point_<T>>* vec) {
  int pts_num = contour.num_vertices;
  (*vec).resize(pts_num);
@@ -105,13 +110,13 @@ T PolyOverlapArea(const T* box1,
                  const T* box2,
                  const size_t box_size,
                  const bool normalized) {
-  gpc::gpc_polygon poly1;
+  phi::funcs::gpc_polygon poly1;
-  gpc::gpc_polygon poly2;
+  phi::funcs::gpc_polygon poly2;
  Array2Poly<T>(box1, box_size, &poly1);
  Array2Poly<T>(box2, box_size, &poly2);
-  gpc::gpc_polygon respoly;
+  phi::funcs::gpc_polygon respoly;
-  gpc::gpc_op op = gpc::GPC_INT;
+  phi::funcs::gpc_op op = phi::funcs::GPC_INT;
-  gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
+  phi::funcs::gpc_polygon_clip(op, &poly2, &poly1, &respoly);
  T inter_area = T(0.);
  int contour_num = respoly.num_contours;
@@ -123,9 +128,9 @@ T PolyOverlapArea(const T* box1,
    inter_area += GetContourArea<T>(resvec);
  }
-  gpc::gpc_free_polygon(&poly1);
+  phi::funcs::gpc_free_polygon(&poly1);
-  gpc::gpc_free_polygon(&poly2);
+  phi::funcs::gpc_free_polygon(&poly2);
-  gpc::gpc_free_polygon(&respoly);
+  phi::funcs::gpc_free_polygon(&respoly);
  return inter_area;
 }

--- a/paddle/fluid/operators/detection/poly_util.h
+++ b/paddle/fluid/operators/detection/poly_util.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detection/gpc.h"
+#include "paddle/phi/kernels/funcs/gpc.h"
 namespace paddle {
 namespace operators {
@@ -47,13 +47,16 @@ void Array2PointVec(const T* box,
                    std::vector<Point_<T>>* vec);
 template <class T>
-void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly);
+void Array2Poly(const T* box,
+                const size_t box_size,
+                phi::funcs::gpc_polygon* poly);
 template <class T>
-void PointVec2Poly(const std::vector<Point_<T>>& vec, gpc::gpc_polygon* poly);
+void PointVec2Poly(const std::vector<Point_<T>>& vec,
+                   phi::funcs::gpc_polygon* poly);
 template <class T>
-void Poly2PointVec(const gpc::gpc_vertex_list& contour,
+void Poly2PointVec(const phi::funcs::gpc_vertex_list& contour,
                   std::vector<Point_<T>>* vec);
 template <class T>

--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -1670,6 +1670,15 @@
    func : multi_dot
  backward : multi_dot_grad
+- api : multiclass_nms3
+  args : (Tensor bboxes, Tensor scores, Tensor rois_num, float score_threshold, int nms_top_k, int keep_top_k, float nms_threshold=0.3, bool normalized=true, float nms_eta=1.0, int background_label=0)
+  output : Tensor(out), Tensor(index), Tensor(nms_rois_num)
+  infer_meta :
+    func : MultiClassNMSInferMeta
+  kernel :
+    func : multiclass_nms3
+  optional : rois_num
 # multinomial
 - api : multinomial
  args : (Tensor x, int num_samples, bool replacement)

--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -743,6 +743,99 @@ void LinspaceInferMeta(const MetaTensor& start,
  LinspaceRawInferMeta(start, stop, number, out);
 }
+// Infers the dims and dtypes of the outputs of the multiclass_nms3 op.
+//
+// bboxes, scores: their ranks and trailing dimensions are validated here, but
+//   only when config.is_runtime is true.
+// rois_num and the scalar attributes (score_threshold ... background_label)
+//   are part of the op signature and are not read by this function.
+// out:          required; dims {-1, box_dims[2] + 2}, dtype of bboxes.
+// index:        required; dims {-1, 1}, dtype INT32.
+// nms_rois_num: may be nullptr when the output is not requested; otherwise
+//   dims {-1}, dtype INT32.
+//
+// Raises InvalidArgument when a runtime shape check fails or when out / index
+// is nullptr.
+void MultiClassNMSInferMeta(const MetaTensor& bboxes,
+                            const MetaTensor& scores,
+                            const MetaTensor& rois_num,
+                            float score_threshold,
+                            int nms_top_k,
+                            int keep_top_k,
+                            float nms_threshold,
+                            bool normalized,
+                            float nms_eta,
+                            int background_label,
+                            MetaTensor* out,
+                            MetaTensor* index,
+                            MetaTensor* nms_rois_num,
+                            MetaConfig config) {
+  auto box_dims = bboxes.dims();
+  auto score_dims = scores.dims();
+  auto score_size = score_dims.size();
+  if (config.is_runtime) {
+    PADDLE_ENFORCE_EQ(
+        score_size == 2 || score_size == 3,
+        true,
+        errors::InvalidArgument("The rank of Input(Scores) must be 2 or 3"
+                                ". But received rank = %d",
+                                score_size));
+    PADDLE_ENFORCE_EQ(
+        box_dims.size(),
+        3,
+        errors::InvalidArgument("The rank of Input(BBoxes) must be 3"
+                                ". But received rank = %d",
+                                box_dims.size()));
+    if (score_size == 3) {
+      // The message lists every width accepted by the condition below.
+      PADDLE_ENFORCE_EQ(box_dims[2] == 4 || box_dims[2] == 8 ||
+                            box_dims[2] == 16 || box_dims[2] == 24 ||
+                            box_dims[2] == 32,
+                        true,
+                        errors::InvalidArgument(
+                            "The last dimension of Input"
+                            "(BBoxes) must be 4 or 8 or 16 or 24 or 32, "
+                            "represents the layout of coordinate "
+                            "[xmin, ymin, xmax, ymax] or "
+                            "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or "
+                            "8 points: [xi, yi] i= 1,2,...,8 or "
+                            "12 points: [xi, yi] i= 1,2,...,12 or "
+                            "16 points: [xi, yi] i= 1,2,...,16"));
+      PADDLE_ENFORCE_EQ(
+          box_dims[1],
+          score_dims[2],
+          errors::InvalidArgument(
+              "The 2nd dimension of Input(BBoxes) must be equal to "
+              "last dimension of Input(Scores), which represents the "
+              "predicted bboxes. "
+              "But received box_dims[1](%s) != score_dims[2](%s)",
+              box_dims[1],
+              score_dims[2]));
+    } else {
+      PADDLE_ENFORCE_EQ(box_dims[2],
+                        4,
+                        errors::InvalidArgument(
+                            "The last dimension of Input"
+                            "(BBoxes) must be 4. But received dimension = %d",
+                            box_dims[2]));
+      PADDLE_ENFORCE_EQ(
+          box_dims[1],
+          score_dims[1],
+          errors::InvalidArgument(
+              "The 2nd dimension of Input"
+              "(BBoxes) must be equal to the 2nd dimension of Input(Scores). "
+              "But received box dimension = %d, score dimension = %d",
+              box_dims[1],
+              score_dims[1]));
+    }
+  }
+  PADDLE_ENFORCE_NE(out,
+                    nullptr,
+                    errors::InvalidArgument(
+                        "The out in MultiClassNMSInferMeta can't be nullptr."));
+  PADDLE_ENFORCE_NE(
+      index,
+      nullptr,
+      errors::InvalidArgument(
+          "The index in MultiClassNMSInferMeta can't be nullptr."));
+  // The leading -1 is a placeholder: the real number of kept boxes is only
+  // known after the kernel runs, which rewrites the first dimension.
+  out->set_dims(phi::make_ddim({-1, box_dims[2] + 2}));
+  out->set_dtype(bboxes.dtype());
+  // Index carries one int32 per output row, so its width is 1 rather than
+  // the width of `out`.
+  index->set_dims(phi::make_ddim({-1, 1}));
+  index->set_dtype(DataType::INT32);
+  // NmsRoisNum is only attached when the caller asks for it, so the pointer
+  // may legitimately be null here.
+  if (nms_rois_num != nullptr) {
+    nms_rois_num->set_dims(phi::make_ddim({-1}));
+    nms_rois_num->set_dtype(DataType::INT32);
+  }
+}
 void NllLossRawInferMeta(const MetaTensor& input,
                         const MetaTensor& label,
                         const MetaTensor& weight,

--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -123,6 +123,21 @@ void LinspaceInferMeta(const MetaTensor& start,
                       DataType dtype,
                       MetaTensor* out);
+// Shape/dtype inference for the multiclass_nms3 op. Takes the three input
+// tensors' metadata, the op's scalar attributes, and writes the inferred
+// metadata into out / index / nms_rois_num. `config` defaults to a
+// default-constructed MetaConfig.
+void MultiClassNMSInferMeta(const MetaTensor& bboxes,
+                            const MetaTensor& scores,
+                            const MetaTensor& rois_num,
+                            float score_threshold,
+                            int nms_top_k,
+                            int keep_top_k,
+                            float nms_threshold,
+                            bool normalized,
+                            float nms_eta,
+                            int background_label,
+                            MetaTensor* out,
+                            MetaTensor* index,
+                            MetaTensor* nms_rois_num,
+                            MetaConfig config = MetaConfig());
 void NllLossRawInferMeta(const MetaTensor& input,
                         const MetaTensor& label,
                         const MetaTensor& weight,

--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -80,6 +80,7 @@ set(COMMON_KERNEL_DEPS
    lod_utils
    custom_kernel
    string_infermeta
+    gpc
    utf8proc)
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})

--- a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc
+++ b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc
--- a/paddle/phi/kernels/funcs/CMakeLists.txt
+++ b/paddle/phi/kernels/funcs/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(detail)
 math_library(deformable_conv_functor DEPS dense_tensor)
 math_library(concat_and_split_functor DEPS dense_tensor)
 math_library(fc_functor DEPS blas jit_kernel_helper)
+math_library(gpc DEPS phi_enforce)
 math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
 math_library(math_function DEPS blas dense_tensor tensor)

--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -23,11 +23,12 @@
 * @date 2018/6/12
 **/
-#include "paddle/fluid/operators/detection/gpc.h"
+#include "paddle/phi/kernels/funcs/gpc.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/core/enforce.h"
-namespace gpc {
+namespace phi {
+namespace funcs {
 typedef struct lmt_shape { /* Local minima table                */
  double y;                /* Y coordinate at local minimum     */
@@ -541,9 +542,8 @@ static int count_contours(polygon_node *polygon) {
 }
 static void add_left(polygon_node *p, double x, double y) {
-  PADDLE_ENFORCE_NOT_NULL(p,
+  PADDLE_ENFORCE_NOT_NULL(
-                          paddle::platform::errors::InvalidArgument(
+      p, phi::errors::InvalidArgument("Input polygon node is nullptr."));
-                              "Input polygon node is nullptr."));
  vertex_node *nv = NULL;
  /* Create a new vertex node and set its fields */
@@ -599,9 +599,8 @@ static void add_right(polygon_node *p, double x, double y) {
 }
 static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
-  PADDLE_ENFORCE_NOT_NULL(p,
+  PADDLE_ENFORCE_NOT_NULL(
-                          paddle::platform::errors::InvalidArgument(
+      p, phi::errors::InvalidArgument("Input polygon node is nullptr."));
-                              "Input polygon node is nullptr."));
  polygon_node *target = NULL;
  /* Label contour as external */
@@ -681,8 +680,7 @@ void add_vertex(vertex_node **t, double x, double y) {
 void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
  PADDLE_ENFORCE_NOT_NULL(
-      e,
+      e, phi::errors::InvalidArgument("Input edge node is nullptr."));
-      paddle::platform::errors::InvalidArgument("Input edge node is nullptr."));
  add_vertex(&(e->outp[p]->v[s]), x, y);
  e->outp[p]->active++;
 }
@@ -715,9 +713,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
  gpc_malloc<bbox>(box,
                   p->num_contours * sizeof(bbox),
                   const_cast<char *>("Bounding box creation"));
-  PADDLE_ENFORCE_NOT_NULL(box,
+  PADDLE_ENFORCE_NOT_NULL(
-                          paddle::platform::errors::ResourceExhausted(
+      box, phi::errors::ResourceExhausted("Failed to malloc box memory."));
-                              "Failed to malloc box memory."));
  /* Construct contour bounding boxes */
  for (c = 0; c < p->num_contours; c++) {
@@ -882,9 +879,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
  gpc_malloc<int>(extended_hole,
                  (p->num_contours + 1) * sizeof(int),
                  const_cast<char *>("contour hole addition"));
-  PADDLE_ENFORCE_NOT_NULL(extended_hole,
+  PADDLE_ENFORCE_NOT_NULL(
-                          paddle::platform::errors::ResourceExhausted(
+      extended_hole,
-                              "Failed to malloc extended hole memory."));
+      phi::errors::ResourceExhausted("Failed to malloc extended hole memory."));
  /* Create an extended contour array */
  gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -1005,7 +1002,7 @@ void gpc_polygon_clip(gpc_op op,
  gpc_malloc<double>(
      sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation"));
  PADDLE_ENFORCE_NOT_NULL(sbt,
-                          paddle::platform::errors::ResourceExhausted(
+                          phi::errors::ResourceExhausted(
                              "Failed to malloc scanbeam table memory."));
  build_sbt(&scanbeam, sbt, sbtree);
@@ -1050,8 +1047,7 @@ void gpc_polygon_clip(gpc_op op,
    e1 = aet;
    /* Set up bundle fields of first edge */
    PADDLE_ENFORCE_NOT_NULL(
-        aet,
+        aet, phi::errors::InvalidArgument("Edge node AET is nullptr."));
-        paddle::platform::errors::InvalidArgument("Edge node AET is nullptr."));
    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
    aet->bundle[ABOVE][!aet->type] = 0;
@@ -1651,7 +1647,7 @@ void gpc_tristrip_clip(gpc_op op,
  gpc_malloc<double>(
      sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation"));
  PADDLE_ENFORCE_NOT_NULL(sbt,
-                          paddle::platform::errors::ResourceExhausted(
+                          phi::errors::ResourceExhausted(
                              "Failed to malloc scanbeam table memory."));
  build_sbt(&scanbeam, sbt, sbtree);
  scanbeam = 0;
@@ -1691,8 +1687,7 @@ void gpc_tristrip_clip(gpc_op op,
    /* Set up bundle fields of first edge */
    PADDLE_ENFORCE_NOT_NULL(
-        aet,
+        aet, phi::errors::InvalidArgument("Edge node AET is nullptr."));
-        paddle::platform::errors::InvalidArgument("Edge node AET is nullptr."));
    aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
    aet->bundle[ABOVE][!aet->type] = 0;
    aet->bstate[ABOVE] = UNBUNDLED;
@@ -2248,6 +2243,7 @@ void gpc_tristrip_clip(gpc_op op,
  gpc_free<double>(sbt);
 }  // NOLINT
-}  // namespace gpc
+}  // namespace funcs
+}  // namespace phi
 /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
--- a/paddle/fluid/operators/detection/gpc.h
+++ b/paddle/fluid/operators/detection/gpc.h
@@ -29,15 +29,16 @@
 * @date 2018/6/12
 **/
-#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+#ifndef PADDLE_PHI_KERNELS_FUNCS_GPC_H_  // GPC_H_
-#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_  // GPC_H_
+#define PADDLE_PHI_KERNELS_FUNCS_GPC_H_  // GPC_H_
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-namespace gpc {
+namespace phi {
+namespace funcs {
 typedef enum {  // Set operation type
  GPC_DIFF,     // Difference
@@ -190,7 +191,7 @@ inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
 template <typename T>
 void gpc_malloc(T *&p, int b, char *s) {
  if (b > 0) {
-    p = (T *)malloc(b);
+    p = reinterpret_cast<T *>(malloc(b));
    if (!p) {
      fprintf(stderr, "gpc malloc failure: %s\n", s);
@@ -243,7 +244,8 @@ void gpc_free_polygon(gpc_polygon *polygon);
 void gpc_free_tristrip(gpc_tristrip *tristrip);
-}  // namespace gpc
+}  // namespace funcs
+}  // namespace phi
-#endif  // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_
+#endif  // PADDLE_PHI_KERNELS_FUNCS_GPC_H_
 /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
--- a/paddle/phi/kernels/multiclass_nms3_kernel.h
+++ b/paddle/phi/kernels/multiclass_nms3_kernel.h
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+namespace phi {
+// Declaration of the phi multiclass_nms3 kernel.
+//
+// T is the element type and Context the device context type. `rois_num` is
+// an optional input (paddle::optional). Results are written through the
+// three output pointers: out, index and nms_rois_num. The argument order
+// matches the KernelSignature registered for "multiclass_nms3".
+template <typename T, typename Context>
+void MultiClassNMSKernel(const Context& ctx,
+                         const DenseTensor& bboxes,
+                         const DenseTensor& scores,
+                         const paddle::optional<DenseTensor>& rois_num,
+                         float score_threshold,
+                         int nms_top_k,
+                         int keep_top_k,
+                         float nms_threshold,
+                         bool normalized,
+                         float nms_eta,
+                         int background_label,
+                         DenseTensor* out,
+                         DenseTensor* index,
+                         DenseTensor* nms_rois_num);
+}  // namespace phi
--- a/paddle/phi/ops/compat/multiclass_nms3_sig.cc
+++ b/paddle/phi/ops/compat/multiclass_nms3_sig.cc
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/core/compat/op_utils.h"
+namespace phi {
+// Maps the fluid op `multiclass_nms3` onto the phi kernel of the same name.
+// The returned signature lists, in kernel-argument order, the op's input
+// names, attribute names and output names. `ctx` is not consulted: the
+// mapping is unconditional.
+KernelSignature MultiClassNMS3OpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("multiclass_nms3",
+                         {"BBoxes", "Scores", "RoisNum"},
+                         {"score_threshold",
+                          "nms_top_k",
+                          "keep_top_k",
+                          "nms_threshold",
+                          "normalized",
+                          "nms_eta",
+                          "background_label"},
+                         {"Out", "Index", "NmsRoisNum"});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(multiclass_nms3,
+                           phi::MultiClassNMS3OpArgumentMapping);
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1457,6 +1457,7 @@ class OpTest(unittest.TestCase):
                # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng
                if expect_np.size == 0:
                    self.op_test.assertTrue(actual_np.size == 0)  # }}}
+                # print("actual_np, expect_np", actual_np, expect_np)
                self._compare_numpy(name, actual_np, expect_np)
                if isinstance(expect, tuple):
                    self._compare_list(name, actual, expect)

--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -19,7 +19,81 @@ import copy
 from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
+from paddle.fluid import Program, program_guard, in_dygraph_mode, _non_static_mode
+from paddle.fluid.layer_helper import LayerHelper
+from paddle import _C_ops
+def multiclass_nms3(bboxes,
+                    scores,
+                    rois_num=None,
+                    score_threshold=0.3,
+                    nms_top_k=1000,
+                    keep_top_k=100,
+                    nms_threshold=0.3,
+                    normalized=True,
+                    nms_eta=1.,
+                    background_label=-1,
+                    return_index=True,
+                    return_rois_num=True,
+                    name=None):
+    """Python wrapper around the ``multiclass_nms3`` op for this test file.
+
+    Dispatches on the execution mode:
+
+    * ``in_dygraph_mode()``: calls ``_C_ops.final_state_multiclass_nms3``
+      with the attributes passed positionally.
+    * ``_non_static_mode()``: calls ``_C_ops.multiclass_nms3`` with the
+      attributes passed as alternating name/value pairs.
+    * otherwise: appends a ``multiclass_nms3`` op through ``LayerHelper``.
+
+    Returns:
+        A 3-tuple. The two dynamic branches return
+        ``(output, index, nms_rois_num)``; the static branch returns
+        ``(output, nms_rois_num, index)``. ``index`` is ``None`` when
+        ``return_index`` is false; in the static branch ``nms_rois_num`` is
+        ``None`` when ``return_rois_num`` is false.
+
+    NOTE(review): the tuple order differs between the dynamic and static
+    branches, and ``return_rois_num`` is not consulted in the dynamic
+    branches -- confirm whether this is intentional.
+    """
+    # locals() forwards every argument (including ``name``) to LayerHelper.
+    helper = LayerHelper('multiclass_nms3', **locals())
+    if in_dygraph_mode():
+        # Positional attributes, in the order given by the op's yaml entry.
+        attrs = (score_threshold, nms_top_k, keep_top_k, nms_threshold,
+                 normalized, nms_eta, background_label)
+        output, index, nms_rois_num = _C_ops.final_state_multiclass_nms3(
+            bboxes, scores, rois_num, *attrs)
+        if not return_index:
+            index = None
+        return output, index, nms_rois_num
+    elif _non_static_mode():
+        # This path takes attributes as flat name/value pairs.
+        attrs = ('background_label', background_label, 'score_threshold',
+                 score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold',
+                 nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta,
+                 'normalized', normalized)
+        output, index, nms_rois_num = _C_ops.multiclass_nms3(
+            bboxes, scores, rois_num, *attrs)
+        if not return_index:
+            index = None
+        return output, index, nms_rois_num
+    else:
+        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+        index = helper.create_variable_for_type_inference(dtype='int32')
+        inputs = {'BBoxes': bboxes, 'Scores': scores}
+        outputs = {'Out': output, 'Index': index}
+        # RoisNum and NmsRoisNum are only wired into the op when requested.
+        if rois_num is not None:
+            inputs['RoisNum'] = rois_num
+        if return_rois_num:
+            nms_rois_num = helper.create_variable_for_type_inference(
+                dtype='int32')
+            outputs['NmsRoisNum'] = nms_rois_num
+        helper.append_op(type="multiclass_nms3",
+                         inputs=inputs,
+                         attrs={
+                             'background_label': background_label,
+                             'score_threshold': score_threshold,
+                             'nms_top_k': nms_top_k,
+                             'nms_threshold': nms_threshold,
+                             'keep_top_k': keep_top_k,
+                             'nms_eta': nms_eta,
+                             'normalized': normalized
+                         },
+                         outputs=outputs)
+        output.stop_gradient = True
+        index.stop_gradient = True
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            nms_rois_num = None
+        # NOTE(review): order here is (output, nms_rois_num, index), unlike
+        # the two dynamic branches above.
+        return output, nms_rois_num, index
 def softmax(x):
@@ -541,8 +615,9 @@ class TestMulticlassNMS2LoDInput(TestMulticlassNMSLoDInput):
            'normalized': normalized,
        }
-    def test_check_output(self):
-        self.check_output()
+    # Kept as a method of TestMulticlassNMS2LoDInput: at module scope it would
+    # no longer belong to the test case.
+    def test_check_output(self):
+        self.check_output()
 class TestMulticlassNMS2LoDNoOutput(TestMulticlassNMS2LoDInput):
@@ -590,6 +665,7 @@ class TestMulticlassNMSError(unittest.TestCase):
 class TestMulticlassNMS3Op(TestMulticlassNMS2Op):
    def setUp(self):
+        self.python_api = multiclass_nms3
        self.set_argument()
        N = 7
        M = 1200
@@ -623,8 +699,8 @@ class TestMulticlassNMS3Op(TestMulticlassNMS2Op):
        self.op_type = 'multiclass_nms3'
        self.inputs = {'BBoxes': boxes, 'Scores': scores}
        self.outputs = {
-            'Out': (nmsed_outs, [lod]),
+            'Out': nmsed_outs,
-            'Index': (index_outs, [lod]),
+            'Index': index_outs,
            'NmsRoisNum': np.array(lod).astype('int32')
        }
        self.attrs = {
@@ -638,7 +714,7 @@ class TestMulticlassNMS3Op(TestMulticlassNMS2Op):
        }
    def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 class TestMulticlassNMS3OpNoOutput(TestMulticlassNMS3Op):
@@ -649,71 +725,6 @@ class TestMulticlassNMS3OpNoOutput(TestMulticlassNMS3Op):
        self.score_threshold = 2.0
-class TestMulticlassNMS3LoDInput(TestMulticlassNMS2LoDInput):
-    def setUp(self):
-        self.set_argument()
-        M = 1200
-        C = 21
-        BOX_SIZE = 4
-        box_lod = [[1200]]
-        background = 0
-        nms_threshold = 0.3
-        nms_top_k = 400
-        keep_top_k = 200
-        score_threshold = self.score_threshold
-        normalized = False
-        scores = np.random.random((M, C)).astype('float32')
-        scores = np.apply_along_axis(softmax, 1, scores)
-        boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
-        boxes[:, :, 0] = boxes[:, :, 0] * 10
-        boxes[:, :, 1] = boxes[:, :, 1] * 10
-        boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
-        boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
-        det_outs, lod = lod_multiclass_nms(boxes, scores, background,
-                                           score_threshold, nms_threshold,
-                                           nms_top_k, keep_top_k, box_lod,
-                                           normalized)
-        det_outs = np.array(det_outs)
-        nmsed_outs = det_outs[:, :-1].astype('float32') if len(
-            det_outs) else det_outs
-        self.op_type = 'multiclass_nms3'
-        self.inputs = {
-            'BBoxes': (boxes, box_lod),
-            'Scores': (scores, box_lod),
-            'RoisNum': np.array(box_lod).astype('int32')
-        }
-        self.outputs = {
-            'Out': (nmsed_outs, [lod]),
-            'NmsRoisNum': np.array(lod).astype('int32')
-        }
-        self.attrs = {
-            'background_label': 0,
-            'nms_threshold': nms_threshold,
-            'nms_top_k': nms_top_k,
-            'keep_top_k': keep_top_k,
-            'score_threshold': score_threshold,
-            'nms_eta': 1.0,
-            'normalized': normalized,
-        }
-    def test_check_output(self):
-        self.check_output()
-class TestMulticlassNMS3LoDNoOutput(TestMulticlassNMS3LoDInput):
-    def set_argument(self):
-        # Here set 2.0 to test the case there is no outputs.
-        # In practical use, 0.0 < score_threshold < 1.0
-        self.score_threshold = 2.0
 if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()