diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 0e835a62839b4bd0c12a1e2c7c6ec72746b48b0a..7927410ef37862499aadf61d6e04c45af157f347 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -21,6 +21,16 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
+// Returns the running totals {0, c0, c0 + c1, ...} of rois_num's int entries.
+inline std::vector<size_t> GetNmsLodFromRoisNum(const Tensor* rois_num) {
+  const int* counts = rois_num->data<int>();
+  std::vector<size_t> offsets(1, static_cast<size_t>(0));
+  for (int pos = 0; pos < rois_num->numel(); ++pos) {
+    offsets.push_back(offsets.back() + static_cast<size_t>(counts[pos]));
+  }
+  return offsets;
+}
+
 class MultiClassNMSOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -321,6 +331,8 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     auto* outs = ctx.Output<LoDTensor>("Out");
     bool return_index = ctx.HasOutput("Index") ? true : false;
     auto index = ctx.Output<LoDTensor>("Index");
+    bool has_roisnum = ctx.HasInput("RoisNum") ? true : false;
+    auto rois_num = ctx.Input<Tensor>("RoisNum");
     auto score_dims = scores->dims();
     auto score_size = score_dims.size();
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
@@ -332,7 +344,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     int64_t out_dim = box_dim + 2;
     int num_nmsed_out = 0;
     Tensor boxes_slice, scores_slice;
-    int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
+    int n = 0;
+    if (has_roisnum) {
+      n = score_size == 3 ? batch_size : rois_num->numel();
+    } else {
+      n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1;
+    }
     for (int i = 0; i < n; ++i) {
       std::map<int, std::vector<int>> indices;
       if (score_size == 3) {
@@ -341,7 +358,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         boxes_slice = boxes->Slice(i, i + 1);
         boxes_slice.Resize({score_dims[2], box_dim});
       } else {
-        auto boxes_lod = boxes->lod().back();
+        std::vector<size_t> boxes_lod;
+        if (has_roisnum) {
+          boxes_lod = GetNmsLodFromRoisNum(rois_num);
+        } else {
+          boxes_lod = boxes->lod().back();
+        }
         if (boxes_lod[i] == boxes_lod[i + 1]) {
           all_indices.push_back(indices);
           batch_starts.push_back(batch_starts.back());
@@ -380,7 +402,12 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
             offset = i * score_dims[2];
           }
         } else {
-          auto boxes_lod = boxes->lod().back();
+          std::vector<size_t> boxes_lod;
+          if (has_roisnum) {
+            boxes_lod = GetNmsLodFromRoisNum(rois_num);
+          } else {
+            boxes_lod = boxes->lod().back();
+          }
           if (boxes_lod[i] == boxes_lod[i + 1]) continue;
           scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]);
           boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]);
@@ -403,6 +430,15 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         }
       }
     }
+    if (ctx.HasOutput("NmsRoisNum")) {
+      auto* nms_rois_num = ctx.Output<Tensor>("NmsRoisNum");
+      nms_rois_num->mutable_data<int>({n}, ctx.GetPlace());
+      int* num_data = nms_rois_num->data<int>();
+      for (int i = 1; i <= n; i++) {
+        num_data[i - 1] = batch_starts[i] - batch_starts[i - 1];
+      }
+      nms_rois_num->Resize({n});
+    }
 
     framework::LoD lod;
     lod.emplace_back(batch_starts);
@@ -535,6 +571,34 @@ class MultiClassNMS2OpMaker : public MultiClassNMSOpMaker {
   }
 };
 
+// multiclass_nms3: multiclass_nms2 plus a dispensable NmsRoisNum output.
+class MultiClassNMS3Op : public MultiClassNMS2Op {
+ public:
+  MultiClassNMS3Op(const std::string& type,
+                   const framework::VariableNameMap& inputs,
+                   const framework::VariableNameMap& outputs,
+                   const framework::AttributeMap& attrs)
+      : MultiClassNMS2Op(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    MultiClassNMS2Op::InferShape(ctx);
+    // NmsRoisNum is dispensable: only give it a dim when it is present.
+    if (ctx->HasOutput("NmsRoisNum")) ctx->SetOutputDim("NmsRoisNum", {-1});
+  }
+};
+
+class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
+ public:
+  void Make() override {
+    MultiClassNMS2OpMaker::Make();  // Start from the full nms2 proto.
+    AddInput(
+        "RoisNum",
+        "(Tensor) The number of RoIs in shape (B), B is the number of images")
+        .AsDispensable();  // Optional; the kernel checks HasInput("RoisNum").
+    AddOutput("NmsRoisNum", "(Tensor), The number of NMS RoIs in each image")
+        .AsDispensable();
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -551,3 +615,10 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(multiclass_nms2, ops::MultiClassNMSKernel<float>,
                        ops::MultiClassNMSKernel<double>);
+
+REGISTER_OPERATOR(  // Uses EmptyGradOpMaker for OpDesc and OpBase.
+    multiclass_nms3, ops::MultiClassNMS3Op, ops::MultiClassNMS3OpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(multiclass_nms3, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);  // same as nms2
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 7f2736a9b1d4149f54c8eb516d1884141c90d446..cac44173c17727158011674bbd4ac4a9b3cca05a 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -52,6 +52,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"hierarchical_sigmoid",
      {"X", "W", "Label", "PathTable", "PathCode", "Bias"}},
     {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
+    {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -78,6 +79,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
     {"distribute_fpn_proposals",
      {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
     {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}},
+    {"multiclass_nms3", {"Out", "NmsRoisNum"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 34c19b88bcdbacbf8dd861be2b83a30dedc687c4..3158d78db63dcd2889dc84cad3937453d0738fc8 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -571,6 +571,128 @@ class TestMulticlassNMSError(unittest.TestCase):
             self.assertRaises(TypeError, test_scores_Variable)
 
 
+class TestMulticlassNMS3Op(TestMulticlassNMS2Op):
+    """multiclass_nms3 on dense batched input; also checks NmsRoisNum."""
+    def setUp(self):
+        self.set_argument()
+        batch, num_boxes, num_classes, box_size = 7, 1200, 21, 4
+        background = 0
+        nms_threshold, nms_top_k, keep_top_k = 0.3, 400, 200
+        score_threshold = self.score_threshold
+
+        # Per-box class scores, softmax-normalized, laid out as (N, C, M).
+        flat_shape = (batch * num_boxes, num_classes)
+        scores = np.random.random(flat_shape).astype('float32')
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = scores.reshape((batch, num_boxes, num_classes))
+        scores = scores.transpose((0, 2, 1))
+
+        # First two coords are scaled into [0, 0.5], last two into [0.5, 1].
+        boxes = np.random.random((batch, num_boxes, box_size)).astype('float32')
+        boxes[:, :, 0:2] *= 0.5
+        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
+
+        det_outs, lod = batched_multiclass_nms(
+            boxes, scores, background, score_threshold, nms_threshold,
+            nms_top_k, keep_top_k)
+        det_outs = np.array(det_outs)
+        if len(det_outs):
+            # The last column is split off as Index; the rest is Out.
+            nmsed_outs = det_outs[:, :-1].astype('float32')
+            index_outs = det_outs[:, -1:].astype('int')
+        else:
+            nmsed_outs = index_outs = det_outs
+
+        self.op_type = 'multiclass_nms3'
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
+        self.outputs = {
+            'Out': (nmsed_outs, [lod]),
+            'Index': (index_outs, [lod]),
+            'NmsRoisNum': np.array(lod).astype('int32')
+        }
+        self.attrs = {
+            'background_label': 0,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+            'normalized': True,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMulticlassNMS3OpNoOutput(TestMulticlassNMS3Op):
+    def set_argument(self):
+        # Use a threshold of 2.0 to exercise the case with no detections;
+        # in practical use, 0.0 < score_threshold < 1.0.
+        self.score_threshold = 2.0
+
+
+class TestMulticlassNMS3LoDInput(TestMulticlassNMS2LoDInput):
+    """multiclass_nms3 on LoD input with an explicit RoisNum tensor."""
+    def setUp(self):
+        self.set_argument()
+        M = 1200
+        C = 21
+        BOX_SIZE = 4
+        box_lod = [[1200]]
+        background = 0
+        nms_threshold = 0.3
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = self.score_threshold
+        normalized = False
+
+        scores = np.random.random((M, C)).astype('float32')
+        scores = np.apply_along_axis(softmax, 1, scores)
+
+        boxes = np.random.random((M, C, BOX_SIZE)).astype('float32')
+        boxes[:, :, 0] = boxes[:, :, 0] * 10
+        boxes[:, :, 1] = boxes[:, :, 1] * 10
+        boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10
+        boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10
+
+        det_outs, lod = lod_multiclass_nms(
+            boxes, scores, background, score_threshold, nms_threshold,
+            nms_top_k, keep_top_k, box_lod, normalized)
+
+        det_outs = np.array(det_outs)
+        nmsed_outs = det_outs[:, :-1].astype('float32') if len(
+            det_outs) else det_outs
+        self.op_type = 'multiclass_nms3'
+        self.inputs = {
+            'BBoxes': (boxes, box_lod),
+            'Scores': (scores, box_lod),
+            # Flat (B,) counts, matching the documented RoisNum shape.
+            'RoisNum': np.array(box_lod[-1]).astype('int32')
+        }
+        self.outputs = {
+            'Out': (nmsed_outs, [lod]),
+            'NmsRoisNum': np.array(lod).astype('int32')
+        }
+        self.attrs = {
+            'background_label': background,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+            'normalized': normalized,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMulticlassNMS3LoDNoOutput(TestMulticlassNMS3LoDInput):
+    def set_argument(self):
+        # Use a threshold of 2.0 to exercise the case with no detections;
+        # in practical use, 0.0 < score_threshold < 1.0.
+        self.score_threshold = 2.0
+
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/tools/static_mode_white_list.pyc b/tools/static_mode_white_list.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d2a45c248ce271c1c4fff310505a172339e5eee
Binary files /dev/null and b/tools/static_mode_white_list.pyc differ