update roi extractor & post_process (#1664)

06e6afcf · wangguanzhong · GitHub · 8a878423 · 06e6afcf · 06e6afcf
14 changed files
--- a/configs/mask_rcnn_r50_fpn_1x.yml
+++ b/configs/mask_rcnn_r50_fpn_1x.yml
@@ -13,7 +13,7 @@ load_static_weights: True
 # Model Achitecture
 MaskRCNN:
  # model anchor info flow
-  anchor: AnchorRPN
+  anchor: Anchor
  proposal: Proposal
  mask: Mask
  # model feat info flow
@@ -22,6 +22,9 @@ MaskRCNN:
  rpn_head: RPNHead
  bbox_head: BBoxHead
  mask_head: MaskHead
+  # post process
+  bbox_post_process: BBoxPostProcess
+  mask_post_process: MaskPostProcess

 ResNet:
  # index 0 stands for res2
@@ -38,7 +41,6 @@ FPN:
  max_level: 4
  spatial_scale: [0.25, 0.125, 0.0625, 0.03125]

-
 RPNHead:
  rpn_feat:
    name: RPNFeat
@@ -47,33 +49,7 @@ RPNHead:
  anchor_per_position: 3
  rpn_channel: 256

-BBoxHead:
-  bbox_feat:
-    name: BBoxFeat
-    roi_extractor:
-      name: RoIExtractor
-      resolution: 7
-      sampling_ratio: 2
-    head_feat:
-      name: TwoFCHead
-      in_dim: 256
-      mlp_dim: 1024
-  in_feat: 1024
-
-MaskHead:
-  mask_feat:
-    name: MaskFeat
-    num_convs: 4
-    feat_in: 256
-    feat_out: 256
-    mask_roi_extractor:
-      name: RoIExtractor
-      resolution: 14
-      sampling_ratio: 2
-    share_bbox_feat: False
-  feat_in: 256
-
-AnchorRPN:
+Anchor:
  anchor_generator:
    name: AnchorGeneratorRPN
    aspect_ratios: [0.5, 1.0, 2.0]
@@ -104,22 +80,52 @@ Proposal:
    bg_thresh_lo: [0.0,]
    fg_thresh: [0.5,]
    fg_fraction: 0.25
-  bbox_post_process: # used in infer
-    name: BBoxPostProcess
-    # decode -> clip -> nms
-    decode_clip_nms:
-      name: DecodeClipNms
-      keep_top_k: 100
-      score_threshold: 0.05
-      nms_threshold: 0.5
+
+BBoxHead:
+  bbox_feat:
+    name: BBoxFeat
+    roi_extractor:
+      name: RoIAlign
+      resolution: 7
+      sampling_ratio: 2
+    head_feat:
+      name: TwoFCHead
+      in_dim: 256
+      mlp_dim: 1024
+  in_feat: 1024
+
+BBoxPostProcess:
+  decode:
+    name: RCNNBox
+    num_classes: 81
+    batch_size: 1
+  nms:
+    name: MultiClassNMS
+    keep_top_k: 100
+    score_threshold: 0.05
+    nms_threshold: 0.5

 Mask:
  mask_target_generator:
    name: MaskTargetGenerator
    mask_resolution: 28
-  mask_post_process:
-    name: MaskPostProcess
-    mask_resolution: 28
+
+MaskHead:
+  mask_feat:
+    name: MaskFeat
+    num_convs: 4
+    feat_in: 256
+    feat_out: 256
+    mask_roi_extractor:
+      name: RoIAlign
+      resolution: 14
+      sampling_ratio: 2
+    share_bbox_feat: False
+  feat_in: 256
+
+
+MaskPostProcess:
+  mask_resolution: 28


 # Train

--- a/configs/yolov3_darknet.yml
+++ b/configs/yolov3_darknet.yml
@@ -15,6 +15,7 @@ YOLOv3:
  anchor: AnchorYOLO
  backbone: DarkNet
  yolo_head: YOLOv3Head
+  post_process: BBoxPostProcess

 DarkNet:
  depth: 53
@@ -29,27 +30,28 @@ YOLOv3Head:
  label_smooth: true
  anchor_per_position: 3

+BBoxPostProcess:
+  decode:
+    name: YOLOBox
+    conf_thresh: 0.005
+    downsample_ratio: 32
+    clip_bbox: True
+  nms:
+    name: MultiClassNMS
+    keep_top_k: 100
+    score_threshold: 0.01
+    nms_threshold: 0.45
+    nms_top_k: 1000
+    normalized: false
+    background_label: -1
+
+
 AnchorYOLO:
  anchor_generator:
    name: AnchorGeneratorYOLO
    anchors: [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
    anchor_masks: [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
-  anchor_post_process:
-    name: BBoxPostProcessYOLO
-    # decode -> clip
-    yolo_box:
-      name: YOLOBox
-      conf_thresh: 0.005
-      downsample_ratio: 32
-      clip_bbox: True
-    nms:
-      name: MultiClassNMS
-      keep_top_k: 100
-      score_threshold: 0.01
-      nms_threshold: 0.45
-      nms_top_k: 1000
-      normalized: false
-      background_label: -1
+

 LearningRate:
  base_lr: 0.001

--- a/ppdet/modeling/__init__.py
+++ b/ppdet/modeling/__init__.py
@@ -5,6 +5,7 @@ from . import backbone
 from . import neck
 from . import head
 from . import architecture
+from . import post_process

 from .ops import *
 from .bbox import *
@@ -13,3 +14,4 @@ from .backbone import *
 from .neck import *
 from .head import *
 from .architecture import *
+from .post_process import *
--- a/ppdet/modeling/architecture/mask_rcnn.py
+++ b/ppdet/modeling/architecture/mask_rcnn.py
@@ -21,6 +21,8 @@ class MaskRCNN(BaseArch):
        'rpn_head',
        'bbox_head',
        'mask_head',
+        'bbox_post_process',
+        'mask_post_process',
    ]

    def __init__(self,
@@ -31,6 +33,8 @@ class MaskRCNN(BaseArch):
                 rpn_head,
                 bbox_head,
                 mask_head,
+                 bbox_post_process,
+                 mask_post_process,
                 neck=None):
        super(MaskRCNN, self).__init__()
        self.anchor = anchor
@@ -41,6 +45,8 @@ class MaskRCNN(BaseArch):
        self.rpn_head = rpn_head
        self.bbox_head = bbox_head
        self.mask_head = mask_head
+        self.bbox_post_process = bbox_post_process
+        self.mask_post_process = mask_post_process

    def model_arch(self):
        # Backbone
@@ -72,9 +78,11 @@ class MaskRCNN(BaseArch):

        rois_has_mask_int32 = None
        if self.inputs['mode'] == 'infer':
+            bbox_pred, bboxes = self.bbox_head.get_prediction(
+                self.bbox_head_out, rois)
            # Refine bbox by the output from bbox_head at test stage
-            self.bboxes = self.proposal.post_process(self.inputs,
-                                                     self.bbox_head_out, rois)
+            self.bboxes = self.bbox_post_process(bbox_pred, bboxes,
+                                                 self.inputs['im_info'])
        else:
            # Proposal RoI for Mask branch
            # bboxes update at training stage only
@@ -111,7 +119,7 @@ class MaskRCNN(BaseArch):
        return loss

    def infer(self, ):
-        mask = self.mask.post_process(self.bboxes, self.mask_head_out,
+        mask = self.mask_post_process(self.bboxes, self.mask_head_out,
                                      self.inputs['im_info'])
        bbox, bbox_num = self.bboxes
        output = {

--- a/ppdet/modeling/architecture/yolo.py
+++ b/ppdet/modeling/architecture/yolo.py
@@ -15,13 +15,15 @@ class YOLOv3(BaseArch):
        'anchor',
        'backbone',
        'yolo_head',
+        'post_process',
    ]

-    def __init__(self, anchor, backbone, yolo_head):
+    def __init__(self, anchor, backbone, yolo_head, post_process):
        super(YOLOv3, self).__init__()
        self.anchor = anchor
        self.backbone = backbone
        self.yolo_head = yolo_head
+        self.post_process = post_process

    def model_arch(self, ):
        # Backbone
@@ -40,11 +42,11 @@ class YOLOv3(BaseArch):
        return yolo_loss

    def infer(self, ):
-        bbox, bbox_num = self.anchor.post_process(
-            self.inputs['im_size'], self.yolo_head_out, self.mask_anchors)
+        bbox, bbox_num = self.post_process(
+            self.yolo_head_out, self.mask_anchors, self.inputs['im_size'])
        outs = {
            "bbox": bbox.numpy(),
-            "bbox_num": bbox_num,
+            "bbox_num": bbox_num.numpy(),
            'im_id': self.inputs['im_id'].numpy()
        }
        return outs
--- a/ppdet/modeling/bbox.py
+++ b/ppdet/modeling/bbox.py
@@ -8,105 +8,11 @@ from . import ops


 @register
-class BBoxPostProcess(object):
-    __shared__ = ['num_classes']
-    __inject__ = ['decode_clip_nms']
-
-    def __init__(self,
-                 decode_clip_nms,
-                 num_classes=81,
-                 cls_agnostic=False,
-                 decode=None,
-                 clip=None,
-                 nms=None,
-                 score_stage=[0, 1, 2],
-                 delta_stage=[2]):
-        super(BBoxPostProcess, self).__init__()
-        self.num_classes = num_classes
-        self.decode = decode
-        self.clip = clip
-        self.nms = nms
-        self.decode_clip_nms = decode_clip_nms
-        self.score_stage = score_stage
-        self.delta_stage = delta_stage
-        self.out_dim = 2 if cls_agnostic else num_classes
-        self.cls_agnostic = cls_agnostic
-
-    def __call__(self, inputs, bboxheads, rois):
-        # TODO: split into 3 steps
-        # TODO: modify related ops for deploying
-        # decode
-        # clip
-        # nms
-        if isinstance(rois, tuple):
-            proposal, proposal_num = rois
-            score, delta = bboxheads[0]
-            bbox_prob = fluid.layers.softmax(score)
-            delta = fluid.layers.reshape(delta, (-1, self.out_dim, 4))
-        else:
-            num_stage = len(rois)
-            proposal_list = []
-            prob_list = []
-            delta_list = []
-            for stage, (proposals, bboxhead) in zip(rois, bboxheads):
-                score, delta = bboxhead
-                proposal, proposal_num = proposals
-                if stage in self.score_stage:
-                    bbox_prob = fluid.layers.softmax(score)
-                    prob_list.append(bbox_prob)
-                if stage in self.delta_stage:
-                    proposal_list.append(proposal)
-                    delta_list.append(delta)
-            bbox_prob = fluid.layers.mean(prob_list)
-            delta = fluid.layers.mean(delta_list)
-            proposal = fluid.layers.mean(proposal_list)
-            delta = fluid.layers.reshape(delta, (-1, self.out_dim, 4))
-            if self.cls_agnostic:
-                delta = delta[:, 1:2, :]
-                delta = fluid.layers.expand(delta, [1, self.num_classes, 1])
-        bboxes = (proposal, proposal_num)
-        bboxes, bbox_nums = self.decode_clip_nms(bboxes, bbox_prob, delta,
-                                                 inputs['im_info'])
-        return bboxes, bbox_nums
-
-
-@register
-class BBoxPostProcessYOLO(object):
-    __shared__ = ['num_classes']
-    __inject__ = ['yolo_box', 'nms']
-
-    def __init__(self, yolo_box, nms, num_classes=80, decode=None, clip=None):
-        super(BBoxPostProcessYOLO, self).__init__()
-        self.yolo_box = yolo_box
-        self.nms = nms
-        self.num_classes = num_classes
-        self.decode = decode
-        self.clip = clip
-
-    def __call__(self, im_size, yolo_head_out, mask_anchors):
-        # TODO: split yolo_box into 2 steps
-        # decode
-        # clip
-        boxes_list = []
-        scores_list = []
-        for i, head_out in enumerate(yolo_head_out):
-            boxes, scores = self.yolo_box(head_out, im_size, mask_anchors[i],
-                                          self.num_classes, i)
-
-            boxes_list.append(boxes)
-            scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))
-        yolo_boxes = paddle.concat(boxes_list, axis=1)
-        yolo_scores = paddle.concat(scores_list, axis=2)
-        bbox, bbox_num = self.nms(bboxes=yolo_boxes, scores=yolo_scores)
-        return bbox, bbox_num
-
-
-@register
-class AnchorRPN(object):
+class Anchor(object):
    __inject__ = ['anchor_generator', 'anchor_target_generator']

    def __init__(self, anchor_generator, anchor_target_generator):
-        super(AnchorRPN, self).__init__()
+        super(Anchor, self).__init__()
        self.anchor_generator = anchor_generator
        self.anchor_target_generator = anchor_target_generator

@@ -167,32 +73,24 @@ class AnchorRPN(object):

 @register
 class AnchorYOLO(object):
-    __inject__ = ['anchor_generator', 'anchor_post_process']
+    __inject__ = ['anchor_generator']

-    def __init__(self, anchor_generator, anchor_post_process):
+    def __init__(self, anchor_generator):
        super(AnchorYOLO, self).__init__()
        self.anchor_generator = anchor_generator
-        self.anchor_post_process = anchor_post_process

    def __call__(self):
        return self.anchor_generator()

-    def post_process(self, im_size, yolo_head_out, mask_anchors):
-        return self.anchor_post_process(im_size, yolo_head_out, mask_anchors)
-

 @register
 class Proposal(object):
-    __inject__ = [
-        'proposal_generator', 'proposal_target_generator', 'bbox_post_process'
-    ]
+    __inject__ = ['proposal_generator', 'proposal_target_generator']

-    def __init__(self, proposal_generator, proposal_target_generator,
-                 bbox_post_process):
+    def __init__(self, proposal_generator, proposal_target_generator):
        super(Proposal, self).__init__()
        self.proposal_generator = proposal_generator
        self.proposal_target_generator = proposal_target_generator
-        self.bbox_post_process = bbox_post_process

    def generate_proposal(self, inputs, rpn_head_out, anchor_out):
        rpn_rois_list = []
@@ -294,7 +192,3 @@ class Proposal(object):

    def get_proposals(self):
        return self.proposals_list
-
-    def post_process(self, inputs, bbox_head_out, rois):
-        bboxes = self.bbox_post_process(inputs, bbox_head_out, rois)
-        return bboxes
--- a/ppdet/modeling/head/__init__.py
+++ b/ppdet/modeling/head/__init__.py
@@ -2,8 +2,10 @@ from . import rpn_head
 from . import bbox_head
 from . import mask_head
 from . import yolo_head
+from . import roi_extractor

 from .rpn_head import *
 from .bbox_head import *
 from .mask_head import *
 from .yolo_head import *
+from .roi_extractor import *
--- a/ppdet/modeling/head/bbox_head.py
+++ b/ppdet/modeling/head/bbox_head.py
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Layer
 from paddle.fluid.param_attr import ParamAttr
@@ -5,6 +6,7 @@ from paddle.fluid.initializer import Normal, Xavier
 from paddle.fluid.regularizer import L2Decay
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
 from ppdet.core.workspace import register
+import paddle.nn.functional as F


 @register
@@ -85,7 +87,9 @@ class BBoxHead(Layer):
                 num_classes=81,
                 cls_agnostic=False,
                 num_stages=1,
-                 with_pool=False):
+                 with_pool=False,
+                 score_stage=[0, 1, 2],
+                 delta_stage=[2]):
        super(BBoxHead, self).__init__()
        self.num_classes = num_classes
        self.delta_dim = 2 if cls_agnostic else num_classes
@@ -94,6 +98,8 @@ class BBoxHead(Layer):
        self.bbox_score_list = []
        self.bbox_delta_list = []
        self.with_pool = with_pool
+        self.score_stage = score_stage
+        self.delta_stage = delta_stage
        for stage in range(num_stages):
            score_name = 'bbox_score_{}'.format(stage)
            delta_name = 'bbox_delta_{}'.format(stage)
@@ -169,3 +175,35 @@ class BBoxHead(Layer):
            loss_bbox[cls_name] = loss_bbox_cls
            loss_bbox[reg_name] = loss_bbox_reg
        return loss_bbox
+
+    def get_prediction(self, bbox_head_out, rois):
+        """Turn raw head outputs into class probabilities and box deltas.
+
+        Args:
+            bbox_head_out: sequence of ``(score, delta)`` pairs, one per
+                stage.
+            rois: ``(proposal, proposal_num)`` when ``bbox_head_out`` holds a
+                single stage, otherwise a sequence of such pairs aligned
+                with ``bbox_head_out``.
+
+        Returns:
+            ``(bbox_pred, bboxes)`` where ``bbox_pred`` is
+            ``(delta, bbox_prob)`` and ``bboxes`` is
+            ``(proposal, proposal_num)``.
+        """
+        if len(bbox_head_out) == 1:
+            proposal, proposal_num = rois
+            score, delta = bbox_head_out[0]
+            bbox_prob = F.softmax(score)
+            delta = paddle.reshape(delta, (-1, self.delta_dim, 4))
+        else:
+            proposal_list = []
+            prob_list = []
+            delta_list = []
+            # enumerate() supplies the stage index that is tested against
+            # score_stage / delta_stage; the head outputs come from the
+            # bbox_head_out argument.
+            for stage, (proposals, bboxhead) in enumerate(
+                    zip(rois, bbox_head_out)):
+                score, delta = bboxhead
+                proposal, proposal_num = proposals
+                if stage in self.score_stage:
+                    prob_list.append(F.softmax(score))
+                if stage in self.delta_stage:
+                    proposal_list.append(proposal)
+                    delta_list.append(delta)
+            # Average the selected stages element-wise.
+            bbox_prob = paddle.mean(paddle.stack(prob_list), axis=0)
+            delta = paddle.mean(paddle.stack(delta_list), axis=0)
+            proposal = paddle.mean(paddle.stack(proposal_list), axis=0)
+            delta = paddle.reshape(delta, (-1, self.delta_dim, 4))
+            # NOTE(review): cls_agnostic is not visibly stored in __init__;
+            # fall back to inferring it from delta_dim -- confirm.
+            cls_agnostic = getattr(self, 'cls_agnostic',
+                                   self.delta_dim != self.num_classes)
+            if cls_agnostic:
+                # Reuse the index-1 delta for every class.
+                N, C, M = delta.shape
+                delta = delta[:, 1:2, :]
+                delta = paddle.expand(delta, [N, self.num_classes, M])
+        bboxes = (proposal, proposal_num)
+        bbox_pred = (delta, bbox_prob)
+        return bbox_pred, bboxes
--- a/ppdet/modeling/head/roi_extractor.py
+++ b/ppdet/modeling/head/roi_extractor.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from ppdet.core.workspace import register
+from ppdet.modeling import ops
+
+
+@register
+class RoIAlign(object):
+    """Extract fixed-size RoI features from one or several feature levels.
+
+    Args:
+        resolution: output size handed to ``ops.roi_align``.
+        sampling_ratio: forwarded to ``ops.roi_align`` on the multi-level
+            path.
+        canconical_level: forwarded to ``ops.distribute_fpn_proposals``
+            (spelling kept: it is part of the constructor signature).
+        canonical_size: forwarded to ``ops.distribute_fpn_proposals``.
+        start_level: index of the first feature level used.
+        end_level: index of the last feature level used (inclusive).
+    """
+
+    def __init__(self,
+                 resolution=14,
+                 sampling_ratio=0,
+                 canconical_level=4,
+                 canonical_size=224,
+                 start_level=0,
+                 end_level=3):
+        super(RoIAlign, self).__init__()
+        self.resolution = resolution
+        self.sampling_ratio = sampling_ratio
+        self.canconical_level = canconical_level
+        self.canonical_size = canonical_size
+        self.start_level = start_level
+        self.end_level = end_level
+
+    def __call__(self, feats, rois, spatial_scale):
+        """Return RoI features for ``rois`` taken from ``feats``.
+
+        ``rois`` is a ``(roi, rois_num)`` pair. With a single level the
+        features come straight from ``ops.roi_align``; otherwise the RoIs
+        are split by ``ops.distribute_fpn_proposals``, aligned per level,
+        concatenated and gathered with ``restore_index``.
+        """
+        roi, rois_num = rois
+        first, last = self.start_level, self.end_level
+        if first == last:
+            # NOTE(review): sampling_ratio is not forwarded on this path and
+            # spatial_scale is passed through un-indexed -- confirm intended.
+            return ops.roi_align(
+                feats[first],
+                roi,
+                self.resolution,
+                spatial_scale,
+                rois_num=rois_num)
+
+        # The min/max levels passed below are the level indices shifted by 2.
+        rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals(
+            roi,
+            first + 2,
+            last + 2,
+            self.canconical_level,
+            self.canonical_size,
+            rois_num=rois_num)
+
+        per_level_feats = [
+            ops.roi_align(
+                feats[lvl],
+                rois_dist[lvl],
+                self.resolution,
+                spatial_scale[lvl],
+                sampling_ratio=self.sampling_ratio,
+                rois_num=rois_num_dist[lvl])
+            for lvl in range(first, last + 1)
+        ]
+        return paddle.gather(paddle.concat(per_level_feats), restore_index)
--- a/ppdet/modeling/layers.py
+++ b/ppdet/modeling/layers.py
@@ -14,12 +14,15 @@

 import numpy as np
 from numbers import Integral
+
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
 from ppdet.core.workspace import register, serializable
 from ppdet.py_op.target import generate_rpn_anchor_target, generate_proposal_target, generate_mask_target
 from ppdet.py_op.post_process import bbox_post_process
 from . import ops
+import paddle.nn.functional as F


 @register
@@ -278,58 +281,71 @@ class MaskTargetGenerator(object):


 @register
-class RoIExtractor(object):
+@serializable
+class RCNNBox(object):
+    __shared__ = ['num_classes', 'batch_size']
+
    def __init__(self,
-                 resolution=14,
-                 sampling_ratio=0,
-                 canconical_level=4,
-                 canonical_size=224,
-                 start_level=0,
-                 end_level=3):
-        super(RoIExtractor, self).__init__()
-        self.resolution = resolution
-        self.sampling_ratio = sampling_ratio
-        self.canconical_level = canconical_level
-        self.canonical_size = canonical_size
-        self.start_level = start_level
-        self.end_level = end_level
-
-    def __call__(self, feats, rois, spatial_scale):
+                 num_classes=81,
+                 batch_size=1,
+                 prior_box_var=[0.1, 0.1, 0.2, 0.2],
+                 code_type="decode_center_size",
+                 box_normalized=False,
+                 axis=1):
+        super(RCNNBox, self).__init__()
+        self.num_classes = num_classes
+        self.batch_size = batch_size
+        self.prior_box_var = prior_box_var
+        self.code_type = code_type
+        self.box_normalized = box_normalized
+        self.axis = axis
+
+    def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
+        bbox_pred, cls_prob = bbox_head_out
        roi, rois_num = rois
-        cur_l = 0
-        if self.start_level == self.end_level:
-            rois_feat = ops.roi_align(
-                feats[self.start_level],
-                roi,
-                self.resolution,
-                spatial_scale,
-                rois_num=rois_num)
-            return rois_feat
-        offset = 2
-        k_min = self.start_level + offset
-        k_max = self.end_level + offset
-        rois_dist, restore_index, rois_num_dist = ops.distribute_fpn_proposals(
-            roi,
-            k_min,
-            k_max,
-            self.canconical_level,
-            self.canonical_size,
-            rois_num=rois_num)
-
-        rois_feat_list = []
-        for lvl in range(self.start_level, self.end_level + 1):
-            roi_feat = ops.roi_align(
-                feats[lvl],
-                rois_dist[lvl],
-                self.resolution,
-                spatial_scale[lvl],
-                sampling_ratio=self.sampling_ratio,
-                rois_num=rois_num_dist[lvl])
-            rois_feat_list.append(roi_feat)
-        rois_feat_shuffle = fluid.layers.concat(rois_feat_list)
-        rois_feat = fluid.layers.gather(rois_feat_shuffle, restore_index)
-
-        return rois_feat
+        origin_shape = im_shape / scale_factor
+        scale_list = []
+        origin_shape_list = []
+        for idx in range(self.batch_size):
+            scale = scale_factor[idx, :]
+            rois_num_per_im = rois_num[idx]
+            expand_scale = paddle.expand(scale, [rois_num_per_im, 1])
+            scale_list.append(expand_scale)
+            expand_im_shape = paddle.expand(origin_shape[idx, :],
+                                            [rois_num_per_im, 2])
+            origin_shape_list.append(expand_im_shape)
+
+        scale = paddle.concat(scale_list)
+        origin_shape = paddle.concat(origin_shape_list)
+
+        bbox = roi / scale
+        bbox = ops.box_coder(
+            prior_box=bbox,
+            prior_box_var=self.prior_box_var,
+            target_box=bbox_pred,
+            code_type=self.code_type,
+            box_normalized=self.box_normalized,
+            axis=self.axis)
+        # TODO: Update box_clip
+        origin_h = origin_shape[:, 0] - 1
+        origin_w = origin_shape[:, 1] - 1
+        zeros = paddle.zeros(origin_h.shape, 'float32')
+        x1 = paddle.maximum(
+            paddle.minimum(
+                bbox[:, :, 0], origin_w, axis=0), zeros, axis=0)
+        y1 = paddle.maximum(
+            paddle.minimum(
+                bbox[:, :, 1], origin_h, axis=0), zeros, axis=0)
+        x2 = paddle.maximum(
+            paddle.minimum(
+                bbox[:, :, 2], origin_w, axis=0), zeros, axis=0)
+        y2 = paddle.maximum(
+            paddle.minimum(
+                bbox[:, :, 3], origin_h, axis=0), zeros, axis=0)
+        bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
+
+        bboxes = (bbox, rois_num)
+        return bboxes, cls_prob


 @register
@@ -367,9 +383,6 @@ class DecodeClipNms(object):
 @register
 @serializable
 class MultiClassNMS(object):
-    __op__ = ops.multiclass_nms
-    __append_doc__ = True
-
    def __init__(self,
                 score_threshold=.05,
                 nms_top_k=-1,
@@ -387,6 +400,13 @@ class MultiClassNMS(object):
        self.nms_eta = nms_eta
        self.background_label = background_label

+    def __call__(self, bboxes, score):
+        """Run ``ops.multiclass_nms`` with this object's attributes as kwargs.
+
+        ``bboxes`` may be a ``(boxes, box_num)`` tuple; in that case the
+        second item is passed on as ``rois_num``.
+        """
+        # Copy so the extra key never leaks into the instance state.
+        nms_kwargs = dict(self.__dict__)
+        if isinstance(bboxes, tuple):
+            bboxes, box_num = bboxes
+            nms_kwargs['rois_num'] = box_num
+        return ops.multiclass_nms(bboxes, score, **nms_kwargs)
+

 @register
 @serializable
@@ -417,19 +437,37 @@ class MatrixNMS(object):
 @register
 @serializable
 class YOLOBox(object):
-    def __init__(
-            self,
-            conf_thresh=0.005,
-            downsample_ratio=32,
-            clip_bbox=True, ):
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_classes=80,
+                 conf_thresh=0.005,
+                 downsample_ratio=32,
+                 clip_bbox=True,
+                 scale_x_y=1.):
+        self.num_classes = num_classes
        self.conf_thresh = conf_thresh
        self.downsample_ratio = downsample_ratio
        self.clip_bbox = clip_bbox
+        self.scale_x_y = scale_x_y

-    def __call__(self, x, img_size, anchors, num_classes, stage=0):
-        outs = ops.yolo_box(x, img_size, anchors, num_classes, self.conf_thresh,
-                            self.downsample_ratio // 2**stage, self.clip_bbox)
-        return outs
+    def __call__(self, yolo_head_out, anchors, im_shape, scale_factor=None):
+        boxes_list = []
+        scores_list = []
+        if scale_factor is not None:
+            origin_shape = im_shape / scale_factor
+        else:
+            origin_shape = im_shape
+        for i, head_out in enumerate(yolo_head_out):
+            boxes, scores = ops.yolo_box(head_out, origin_shape, anchors[i],
+                                         self.num_classes, self.conf_thresh,
+                                         self.downsample_ratio // 2**i,
+                                         self.clip_bbox, self.scale_x_y)
+            boxes_list.append(boxes)
+            scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))
+        yolo_boxes = paddle.concat(boxes_list, axis=1)
+        yolo_scores = paddle.concat(scores_list, axis=2)
+        return yolo_boxes, yolo_scores


 @register

--- a/ppdet/modeling/mask.py
+++ b/ppdet/modeling/mask.py
@@ -2,38 +2,14 @@ import numpy as np
 import paddle.fluid as fluid
 from ppdet.core.workspace import register

-# TODO: regitster mask_post_process op 
-from ppdet.py_op.post_process import mask_post_process
-
-
-@register
-class MaskPostProcess(object):
-    __shared__ = ['mask_resolution']
-
-    def __init__(self, mask_resolution=28, binary_thresh=0.5):
-        super(MaskPostProcess, self).__init__()
-        self.mask_resolution = mask_resolution
-        self.binary_thresh = binary_thresh
-
-    def __call__(self, bboxes, mask_head_out, im_info):
-        # TODO: modify related ops for deploying
-        bboxes_np = (i.numpy() for i in bboxes)
-        mask = mask_post_process(bboxes_np,
-                                 mask_head_out.numpy(),
-                                 im_info.numpy(), self.mask_resolution,
-                                 self.binary_thresh)
-        mask = {'mask': mask}
-        return mask
-

 @register
 class Mask(object):
-    __inject__ = ['mask_target_generator', 'mask_post_process']
+    __inject__ = ['mask_target_generator']

-    def __init__(self, mask_target_generator, mask_post_process):
+    def __init__(self, mask_target_generator):
        super(Mask, self).__init__()
        self.mask_target_generator = mask_target_generator
-        self.mask_post_process = mask_post_process

    def __call__(self, inputs, rois, targets):
        mask_rois, rois_has_mask_int32 = self.generate_mask_target(inputs, rois,
@@ -56,7 +32,3 @@ class Mask(object):

    def get_targets(self):
        return self.mask_int32
-
-    def post_process(self, bboxes, mask_head_out, im_info):
-        mask = self.mask_post_process(bboxes, mask_head_out, im_info)
-        return mask
--- a/ppdet/modeling/ops.py
+++ b/ppdet/modeling/ops.py
@@ -1337,8 +1337,9 @@ def box_coder(prior_box,

        elif isinstance(prior_box_var, list):
            output_box = core.ops.box_coder(
-                prior_box, target_box, "code_type", code_type, "box_normalized",
-                box_normalized, "axis", axis, "variance", prior_box_var)
+                prior_box, None, target_box, "code_type", code_type,
+                "box_normalized", box_normalized, "axis", axis, "variance",
+                prior_box_var)
        else:
            raise TypeError(
                "Input variance of box_coder must be Variable or list")

--- a/ppdet/modeling/post_process.py
+++ b/ppdet/modeling/post_process.py
+import numpy as np
+import paddle.fluid as fluid
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register
+from ppdet.py_op.post_process import mask_post_process
+from . import ops
+
+
+@register
+class BBoxPostProcess(object):
+    """Chain an injected ``decode`` step and an injected ``nms`` step."""
+    __inject__ = ['decode', 'nms']
+
+    def __init__(self, decode=None, nms=None):
+        super(BBoxPostProcess, self).__init__()
+        self.decode = decode
+        self.nms = nms
+
+    def __call__(self, head_out, rois, im_shape, scale_factor=None):
+        """Decode ``head_out`` against ``rois`` and apply NMS.
+
+        Returns the ``(bbox_pred, bbox_num)`` pair produced by ``self.nms``.
+        """
+        # TODO: compatibility path for the legacy im_info layout; remove once
+        # im_shape and scale_factor are passed separately everywhere.
+        # When im_shape has more than two columns, the first two are taken
+        # as the shape and the rest as the scale factor.
+        origin_shape = im_shape
+        if im_shape.shape[1] > 2:
+            origin_shape, scale_factor = im_shape[:, :2], im_shape[:, 2:]
+        decoded_boxes, score = self.decode(head_out, rois, origin_shape,
+                                           scale_factor)
+        bbox_pred, bbox_num = self.nms(decoded_boxes, score)
+        return bbox_pred, bbox_num
+
+
+@register
+class MaskPostProcess(object):
+    """Wrap the ``mask_post_process`` python op behind a callable."""
+    __shared__ = ['mask_resolution']
+
+    def __init__(self, mask_resolution=28, binary_thresh=0.5):
+        super(MaskPostProcess, self).__init__()
+        self.mask_resolution = mask_resolution
+        self.binary_thresh = binary_thresh
+
+    def __call__(self, bboxes, mask_head_out, im_info):
+        """Return ``{'mask': ...}`` computed by ``mask_post_process``.
+
+        Every input is converted with ``.numpy()`` before the call.
+        """
+        # TODO: modify related ops for deploying
+        # Lazy generator: each item of bboxes is converted when consumed.
+        det_arrays = (item.numpy() for item in bboxes)
+        head_np = mask_head_out.numpy()
+        info_np = im_info.numpy()
+        result = mask_post_process(det_arrays, head_np, info_np,
+                                   self.mask_resolution, self.binary_thresh)
+        return {'mask': result}
--- a/ppdet/utils/eval_utils.py
+++ b/ppdet/utils/eval_utils.py
@@ -85,7 +85,7 @@ def eval_results(res, metric, anno_file):
                json.dump(res['mask'], f)
                logger.info('The mask result is saved to mask.json.')

-            seg_stats = cocoapi_eval('mask.json', 'mask', anno_file=anno_file)
+            seg_stats = cocoapi_eval('mask.json', 'segm', anno_file=anno_file)
            eval_res.append(seg_stats)
            sys.stdout.flush()
    else: