[cherry-pick]fix RCNN dygraph to static (#2192)

* fix RCNN dygraph to static

[cherry-pick]fix RCNN dygraph to static (#2192)
* fix RCNN dygraph to static
3f91ef82 · Guanghua Yu · GitHub · 977c55f9 · 3f91ef82 · 3f91ef82
17 changed files
--- a/dygraph/deploy/python/infer.py
+++ b/dygraph/deploy/python/infer.py
@@ -84,16 +84,8 @@ class Detector(object):
            np_boxes[:, 3] *= w
            np_boxes[:, 4] *= h
            np_boxes[:, 5] *= w
-        expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1)
-        np_boxes = np_boxes[expect_boxes, :]
-        for box in np_boxes:
-            print('class_id:{:d}, confidence:{:.4f},'
-                  'left_top:[{:.2f},{:.2f}],'
-                  ' right_bottom:[{:.2f},{:.2f}]'.format(
-                      int(box[0]), box[1], box[2], box[3], box[4], box[5]))
        results['boxes'] = np_boxes
        if np_masks is not None:
-            np_masks = np_masks[expect_boxes, :, :, :]
            results['masks'] = np_masks
        return results

@@ -111,7 +103,7 @@ class Detector(object):
            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                            matix element:[class, score, x_min, y_min, x_max, y_max]
                            MaskRCNN's results include 'masks': np.ndarray:
-                            shape:[N, class_num, mask_resolution, mask_resolution]
+                            shape: [N, im_h, im_w]
        '''
        inputs = self.preprocess(image)
        np_boxes, np_masks = None, None
@@ -125,7 +117,7 @@ class Detector(object):
            output_names = self.predictor.get_output_names()
            boxes_tensor = self.predictor.get_output_handle(output_names[0])
            np_boxes = boxes_tensor.copy_to_cpu()
-            if self.pred_config.mask_resolution is not None:
+            if self.pred_config.mask:
                masks_tensor = self.predictor.get_output_handle(output_names[2])
                np_masks = masks_tensor.copy_to_cpu()

@@ -135,14 +127,7 @@ class Detector(object):
            output_names = self.predictor.get_output_names()
            boxes_tensor = self.predictor.get_output_handle(output_names[0])
            np_boxes = boxes_tensor.copy_to_cpu()
-            score_tensor = self.predictor.get_output_handle(output_names[3])
-            np_score = score_tensor.copy_to_cpu()
-            label_tensor = self.predictor.get_output_handle(output_names[2])
-            np_label = label_tensor.copy_to_cpu()
-            np_boxes = np.concatenate(
-                [np_label[:, np.newaxis], np_score[:, np.newaxis], np_boxes],
-                axis=-1)
-            if self.pred_config.mask_resolution is not None:
+            if self.pred_config.mask:
                masks_tensor = self.predictor.get_output_handle(output_names[2])
                np_masks = masks_tensor.copy_to_cpu()
        t2 = time.time()
@@ -196,10 +181,9 @@ class DetectorSOLOv2(Detector):
            image (str/np.ndarray): path of image/ np.ndarray read by cv2
            threshold (float): threshold of predicted box' score
        Returns:
-            results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
-                            matix element:[class, score, x_min, y_min, x_max, y_max]
-                            MaskRCNN's results include 'masks': np.ndarray:
-                            shape:[N, class_num, mask_resolution, mask_resolution]
+            results (dict): 'segm': np.ndarray, shape:[N, im_h, im_w]
+                            'cate_label': label of segm, shape:[N]
+                            'cate_score': confidence score of segm, shape:[N]
        '''
        inputs = self.preprocess(image)
        np_label, np_score, np_segms = None, None, None
@@ -273,9 +257,9 @@ class PredictConfig():
        self.preprocess_infos = yml_conf['Preprocess']
        self.min_subgraph_size = yml_conf['min_subgraph_size']
        self.labels = yml_conf['label_list']
-        self.mask_resolution = None
-        if 'mask_resolution' in yml_conf:
-            self.mask_resolution = yml_conf['mask_resolution']
+        self.mask = False
+        if 'mask' in yml_conf:
+            self.mask = yml_conf['mask']
        self.input_shape = yml_conf['image_shape']
        self.print_config()

@@ -355,19 +339,9 @@ def load_predictor(model_dir,
    return predictor


-def visualize(image_file,
-              results,
-              labels,
-              mask_resolution=14,
-              output_dir='output/',
-              threshold=0.5):
+def visualize(image_file, results, labels, output_dir='output/', threshold=0.5):
    # visualize the predict result
-    im = visualize_box_mask(
-        image_file,
-        results,
-        labels,
-        mask_resolution=mask_resolution,
-        threshold=threshold)
+    im = visualize_box_mask(image_file, results, labels, threshold=threshold)
    img_name = os.path.split(image_file)[-1]
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
@@ -397,7 +371,6 @@ def predict_image(detector):
            FLAGS.image_file,
            results,
            detector.pred_config.labels,
-            mask_resolution=detector.pred_config.mask_resolution,
            output_dir=FLAGS.output_dir,
            threshold=FLAGS.threshold)

@@ -431,7 +404,6 @@ def predict_video(detector, camera_id):
            frame,
            results,
            detector.pred_config.labels,
-            mask_resolution=detector.pred_config.mask_resolution,
            threshold=FLAGS.threshold)
        im = np.array(im)
        writer.write(im)

--- a/dygraph/deploy/python/visualize.py
+++ b/dygraph/deploy/python/visualize.py
@@ -21,16 +21,15 @@ from PIL import Image, ImageDraw
 from scipy import ndimage


-def visualize_box_mask(im, results, labels, mask_resolution=14, threshold=0.5):
+def visualize_box_mask(im, results, labels, threshold=0.5):
    """
    Args:
        im (str/np.ndarray): path of image/np.ndarray read by cv2
        results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box,
                        matix element:[class, score, x_min, y_min, x_max, y_max]
                        MaskRCNN's results include 'masks': np.ndarray:
-                        shape:[N, class_num, mask_resolution, mask_resolution]
+                        shape:[N, im_h, im_w]
        labels (list): labels:['class1', ..., 'classn']
-        mask_resolution (int): shape of a mask is:[mask_resolution, mask_resolution]
        threshold (float): Threshold of score.
    Returns:
        im (PIL.Image.Image): visualized image
@@ -41,13 +40,9 @@ def visualize_box_mask(im, results, labels, mask_resolution=14, threshold=0.5):
        im = Image.fromarray(im)
    if 'masks' in results and 'boxes' in results:
        im = draw_mask(
-            im,
-            results['boxes'],
-            results['masks'],
-            labels,
-            resolution=mask_resolution)
+            im, results['boxes'], results['masks'], labels, threshold=threshold)
    if 'boxes' in results:
-        im = draw_box(im, results['boxes'], labels)
+        im = draw_box(im, results['boxes'], labels, threshold=threshold)
    if 'segm' in results:
        im = draw_segm(
            im,
@@ -80,91 +75,49 @@ def get_color_map_list(num_classes):
    return color_map


-def expand_boxes(boxes, scale=0.0):
-    """
-    Args:
-        boxes (np.ndarray): shape:[N,4], N:number of box,
-                            matix element:[x_min, y_min, x_max, y_max]
-        scale (float): scale of boxes
-    Returns:
-        boxes_exp (np.ndarray): expanded boxes
-    """
-    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
-    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
-    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
-    y_c = (boxes[:, 3] + boxes[:, 1]) * .5
-    w_half *= scale
-    h_half *= scale
-    boxes_exp = np.zeros(boxes.shape)
-    boxes_exp[:, 0] = x_c - w_half
-    boxes_exp[:, 2] = x_c + w_half
-    boxes_exp[:, 1] = y_c - h_half
-    boxes_exp[:, 3] = y_c + h_half
-    return boxes_exp
-
-
-def draw_mask(im, np_boxes, np_masks, labels, resolution=14, threshold=0.5):
+def draw_mask(im, np_boxes, np_masks, labels, threshold=0.5):
    """
    Args:
        im (PIL.Image.Image): PIL image
        np_boxes (np.ndarray): shape:[N,6], N: number of box,
-                               matix element:[class, score, x_min, y_min, x_max, y_max]
-        np_masks (np.ndarray): shape:[N, class_num, resolution, resolution]
+            matrix element:[class, score, x_min, y_min, x_max, y_max]
+        np_masks (np.ndarray): shape:[N, im_h, im_w]
        labels (list): labels:['class1', ..., 'classn']
-        resolution (int): shape of a mask is:[resolution, resolution]
        threshold (float): threshold of mask
    Returns:
        im (PIL.Image.Image): visualized image
    """
    color_list = get_color_map_list(len(labels))
-    scale = (resolution + 2.0) / resolution
-    im_w, im_h = im.size
    w_ratio = 0.4
    alpha = 0.7
    im = np.array(im).astype('float32')
-    rects = np_boxes[:, 2:]
-    expand_rects = expand_boxes(rects, scale)
-    expand_rects = expand_rects.astype(np.int32)
-    clsid_scores = np_boxes[:, 0:2]
-    padded_mask = np.zeros((resolution + 2, resolution + 2), dtype=np.float32)
    clsid2color = {}
-    for idx in range(len(np_boxes)):
-        clsid, score = clsid_scores[idx].tolist()
-        clsid = int(clsid)
-        xmin, ymin, xmax, ymax = expand_rects[idx].tolist()
-        w = xmax - xmin + 1
-        h = ymax - ymin + 1
-        w = np.maximum(w, 1)
-        h = np.maximum(h, 1)
-        padded_mask[1:-1, 1:-1] = np_masks[idx, int(clsid), :, :]
-        resized_mask = cv2.resize(padded_mask, (w, h))
-        resized_mask = np.array(resized_mask > threshold, dtype=np.uint8)
-        x0 = min(max(xmin, 0), im_w)
-        x1 = min(max(xmax + 1, 0), im_w)
-        y0 = min(max(ymin, 0), im_h)
-        y1 = min(max(ymax + 1, 0), im_h)
-        im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
-        im_mask[y0:y1, x0:x1] = resized_mask[(y0 - ymin):(y1 - ymin), (
-            x0 - xmin):(x1 - xmin)]
+    expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1)
+    np_boxes = np_boxes[expect_boxes, :]
+    np_masks = np_masks[expect_boxes, :, :]
+    for i in range(len(np_masks)):
+        clsid, score = int(np_boxes[i][0]), np_boxes[i][1]
+        mask = np_masks[i]
        if clsid not in clsid2color:
            clsid2color[clsid] = color_list[clsid]
        color_mask = clsid2color[clsid]
        for c in range(3):
            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
-        idx = np.nonzero(im_mask)
+        idx = np.nonzero(mask)
        color_mask = np.array(color_mask)
        im[idx[0], idx[1], :] *= 1.0 - alpha
        im[idx[0], idx[1], :] += alpha * color_mask
    return Image.fromarray(im.astype('uint8'))


-def draw_box(im, np_boxes, labels):
+def draw_box(im, np_boxes, labels, threshold=0.5):
    """
    Args:
        im (PIL.Image.Image): PIL image
        np_boxes (np.ndarray): shape:[N,6], N: number of box,
                               matix element:[class, score, x_min, y_min, x_max, y_max]
        labels (list): labels:['class1', ..., 'classn']
+        threshold (float): score threshold; only boxes scoring above it are drawn
    Returns:
        im (PIL.Image.Image): visualized image
    """
@@ -172,10 +125,15 @@ def draw_box(im, np_boxes, labels):
    draw = ImageDraw.Draw(im)
    clsid2color = {}
    color_list = get_color_map_list(len(labels))
+    expect_boxes = (np_boxes[:, 1] > threshold) & (np_boxes[:, 0] > -1)
+    np_boxes = np_boxes[expect_boxes, :]

    for dt in np_boxes:
        clsid, bbox, score = int(dt[0]), dt[2:], dt[1]
        xmin, ymin, xmax, ymax = bbox
+        print('class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}],'
+              'right_bottom:[{:.2f},{:.2f}]'.format(
+                  int(clsid), score, xmin, ymin, xmax, ymax))
        w = xmax - xmin
        h = ymax - ymin
        if clsid not in clsid2color:

--- a/dygraph/ppdet/engine/export_utils.py
+++ b/dygraph/ppdet/engine/export_utils.py
@@ -98,9 +98,8 @@ def _dump_infer_config(config, path, image_shape, model):
            'Architecture: {} is not supported for exporting model now'.format(
                infer_arch))
        os._exit(0)
-    if 'mask_post_process' in model.__dict__ and model.__dict__[
-            'mask_post_process']:
-        infer_cfg['mask_resolution'] = model.mask_post_process.mask_resolution
+    if 'Mask' in infer_arch:
+        infer_cfg['mask'] = True
    infer_cfg['Preprocess'], infer_cfg[
        'label_list'], image_shape = _parse_reader(
            config['TestReader'], config['TestDataset'], config['metric'],

--- a/dygraph/ppdet/metrics/coco_utils.py
+++ b/dygraph/ppdet/metrics/coco_utils.py
@@ -30,7 +30,7 @@ def get_infer_results(outs, catid, bias=0):
    The output format is dictionary containing bbox or mask result.

    For example, bbox result is a list and each element contains
-    image_id, category_id, bbox and score. 
+    image_id, category_id, bbox and score.
    """
    if outs is None or len(outs) == 0:
        raise ValueError(
@@ -42,19 +42,12 @@ def get_infer_results(outs, catid, bias=0):
    infer_res = {}
    if 'bbox' in outs:
        infer_res['bbox'] = get_det_res(
-            outs['bbox'],
-            outs['score'],
-            outs['label'],
-            outs['bbox_num'],
-            im_id,
-            catid,
-            bias=bias)
+            outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias)

    if 'mask' in outs:
        # mask post process
-        infer_res['mask'] = get_seg_res(outs['mask'], outs['score'],
-                                        outs['label'], outs['bbox_num'], im_id,
-                                        catid)
+        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'],
+                                        outs['bbox_num'], im_id, catid)

    if 'segm' in outs:
        infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid)

--- a/dygraph/ppdet/modeling/architectures/faster_rcnn.py
+++ b/dygraph/ppdet/modeling/architectures/faster_rcnn.py
@@ -99,13 +99,5 @@ class FasterRCNN(BaseArch):

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
-        label = bbox_pred[:, 0]
-        score = bbox_pred[:, 1]
-        bbox = bbox_pred[:, 2:]
-        output = {
-            'bbox': bbox,
-            'score': score,
-            'label': label,
-            'bbox_num': bbox_num
-        }
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
        return output
--- a/dygraph/ppdet/modeling/architectures/fcos.py
+++ b/dygraph/ppdet/modeling/architectures/fcos.py
@@ -91,13 +91,5 @@ class FCOS(BaseArch):

    def get_pred(self):
        bboxes, bbox_num = self._forward()
-        label = bboxes[:, 0]
-        score = bboxes[:, 1]
-        bbox = bboxes[:, 2:]
-        output = {
-            'bbox': bbox,
-            'score': score,
-            'label': label,
-            'bbox_num': bbox_num
-        }
+        output = {'bbox': bboxes, 'bbox_num': bbox_num}
        return output
--- a/dygraph/ppdet/modeling/architectures/mask_rcnn.py
+++ b/dygraph/ppdet/modeling/architectures/mask_rcnn.py
@@ -124,14 +124,5 @@ class MaskRCNN(BaseArch):

    def get_pred(self):
        bbox_pred, bbox_num, mask_pred = self._forward()
-        label = bbox_pred[:, 0]
-        score = bbox_pred[:, 1]
-        bbox = bbox_pred[:, 2:]
-        output = {
-            'label': label,
-            'score': score,
-            'bbox': bbox,
-            'bbox_num': bbox_num,
-            'mask': mask_pred,
-        }
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
        return output
--- a/dygraph/ppdet/modeling/architectures/ttfnet.py
+++ b/dygraph/ppdet/modeling/architectures/ttfnet.py
@@ -91,13 +91,8 @@ class TTFNet(BaseArch):

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
-        label = bbox_pred[:, 0]
-        score = bbox_pred[:, 1]
-        bbox = bbox_pred[:, 2:]
        output = {
-            "bbox": bbox,
-            'score': score,
-            'label': label,
+            "bbox": bbox_pred,
            "bbox_num": bbox_num,
        }
        return output
--- a/dygraph/ppdet/modeling/architectures/yolo.py
+++ b/dygraph/ppdet/modeling/architectures/yolo.py
@@ -61,13 +61,5 @@ class YOLOv3(BaseArch):

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
-        label = bbox_pred[:, 0]
-        score = bbox_pred[:, 1]
-        bbox = bbox_pred[:, 2:]
-        output = {
-            'bbox': bbox,
-            'score': score,
-            'label': label,
-            'bbox_num': bbox_num
-        }
+        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
        return output
--- a/dygraph/ppdet/modeling/bbox_utils.py
+++ b/dygraph/ppdet/modeling/bbox_utils.py
@@ -39,8 +39,6 @@ def bbox2delta(src_boxes, tgt_boxes, weights):

 def delta2bbox(deltas, boxes, weights):
    clip_scale = math.log(1000.0 / 16)
-    if boxes.shape[0] == 0:
-        return paddle.zeros((0, deltas.shape[1]), dtype='float32')

    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
@@ -61,12 +59,13 @@ def delta2bbox(deltas, boxes, weights):
    pred_w = paddle.exp(dw) * widths.unsqueeze(1)
    pred_h = paddle.exp(dh) * heights.unsqueeze(1)

-    pred_boxes = paddle.zeros_like(deltas)
+    pred_boxes = []
+    pred_boxes.append(pred_ctr_x - 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y - 0.5 * pred_h)
+    pred_boxes.append(pred_ctr_x + 0.5 * pred_w)
+    pred_boxes.append(pred_ctr_y + 0.5 * pred_h)
+    pred_boxes = paddle.stack(pred_boxes, axis=-1)

-    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
-    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
-    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
-    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
    return pred_boxes



--- a/dygraph/ppdet/modeling/heads/bbox_head.py
+++ b/dygraph/ppdet/modeling/heads/bbox_head.py
@@ -141,8 +141,7 @@ class BBoxHead(nn.Layer):

        rois_feat = self.roi_extractor(body_feats, rois, rois_num)
        bbox_feat = self.head(rois_feat)
-        #if self.with_pool:
-        if len(bbox_feat.shape) > 2 and bbox_feat.shape[-1] > 1:
+        if self.with_pool:
            feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1)
            feat = paddle.squeeze(feat, axis=[2, 3])
        else:

--- a/dygraph/ppdet/modeling/heads/mask_head.py
+++ b/dygraph/ppdet/modeling/heads/mask_head.py
@@ -182,11 +182,12 @@ class MaskHead(nn.Layer):
                mask_out = F.sigmoid(mask_logit)
            else:
                num_masks = mask_logit.shape[0]
-                pred_masks = paddle.split(mask_logit, num_masks)
                mask_out = []
                # TODO: need to optimize gather
-                for i, pred_mask in enumerate(pred_masks):
-                    mask = paddle.gather(pred_mask, labels[i], axis=1)
+                for i in range(mask_logit.shape[0]):
+                    pred_masks = paddle.unsqueeze(
+                        mask_logit[i, :, :, :], axis=0)
+                    mask = paddle.gather(pred_masks, labels[i], axis=1)
                    mask_out.append(mask)
                mask_out = F.sigmoid(paddle.concat(mask_out))
        return mask_out

--- a/dygraph/ppdet/modeling/layers.py
+++ b/dygraph/ppdet/modeling/layers.py
@@ -316,14 +316,12 @@ class RCNNBox(object):

        # [N, C*4]
        bbox = paddle.concat(roi)
-        bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
+        if bbox.shape[0] == 0:
+            bbox = paddle.zeros([0, bbox_pred.shape[1]], dtype='float32')
+        else:
+            bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
        scores = cls_prob[:, :-1]

-        # [N*C, 4]
-
-        bbox_num_class = bbox.shape[1] // 4
-        bbox = paddle.reshape(bbox, [-1, bbox_num_class, 4])
-
        origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)
        origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)
        zeros = paddle.zeros_like(origin_h)

--- a/dygraph/ppdet/modeling/post_process.py
+++ b/dygraph/ppdet/modeling/post_process.py
@@ -54,8 +54,6 @@ class BBoxPostProcess(object):
                               including labels, scores and bboxes. The size of
                               bboxes are corresponding to the original image.
        """
-        if bboxes.shape[0] == 0:
-            return paddle.zeros(shape=[1, 6])

        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)

@@ -65,9 +63,12 @@ class BBoxPostProcess(object):
        for i in range(bbox_num.shape[0]):
            expand_shape = paddle.expand(origin_shape[i:i + 1, :],
                                         [bbox_num[i], 2])
-            scale_y, scale_x = scale_factor[i]
+            scale_y, scale_x = scale_factor[i][0], scale_factor[i][1]
            scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
            expand_scale = paddle.expand(scale, [bbox_num[i], 4])
+            # TODO: paddle.expand is transformed incorrectly when converting
+            # dygraph to static, so reshape explicitly to avoid errors.
+            expand_scale = paddle.reshape(expand_scale, [bbox_num[i], 4])
            origin_shape_list.append(expand_shape)
            scale_factor_list.append(expand_scale)

@@ -121,6 +122,10 @@ class MaskPostProcess(object):

        gx = paddle.expand(img_x, [N, img_y.shape[1], img_x.shape[2]])
        gy = paddle.expand(img_y, [N, img_y.shape[1], img_x.shape[2]])
+        # TODO: paddle.expand is transformed incorrectly when converting
+        # dygraph to static, so reshape explicitly to avoid errors.
+        gx = paddle.reshape(gx, [N, img_y.shape[1], img_x.shape[2]])
+        gy = paddle.reshape(gy, [N, img_y.shape[1], img_x.shape[2]])
        grid = paddle.stack([gx, gy], axis=3)
        img_masks = F.grid_sample(masks, grid, align_corners=False)
        return img_masks[:, 0]
@@ -129,19 +134,24 @@ class MaskPostProcess(object):
        """
        Paste the mask prediction to the original image.
        """
-        assert bboxes.shape[0] > 0, 'There is no detection output'
-
        num_mask = mask_out.shape[0]
-        # TODO: support bs > 1
+        origin_shape = paddle.cast(origin_shape, 'int32')
+        # TODO: support bs > 1 and a bool dtype for the mask output
        pred_result = paddle.zeros(
-            [num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='bool')
+            [num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='int32')
+        if bboxes.shape[0] == 0:
+            return pred_result
+
        # TODO: optimize chunk paste
+        pred_result = []
        for i in range(bboxes.shape[0]):
-            im_h, im_w = origin_shape[i]
+            im_h, im_w = origin_shape[i][0], origin_shape[i][1]
            pred_mask = self.paste_mask(mask_out[i], bboxes[i:i + 1, 2:], im_h,
                                        im_w)
            pred_mask = pred_mask >= self.binary_thresh
-            pred_result[i] = pred_mask
+            pred_mask = paddle.cast(pred_mask, 'int32')
+            pred_result.append(pred_mask)
+        pred_result = paddle.concat(pred_result)
        return pred_result



--- a/dygraph/ppdet/modeling/proposal_generator/anchor_generator.py
+++ b/dygraph/ppdet/modeling/proposal_generator/anchor_generator.py
@@ -24,7 +24,7 @@ from .. import ops


 @register
-class AnchorGenerator(object):
+class AnchorGenerator(nn.Layer):
    def __init__(self,
                 anchor_sizes=[32, 64, 128, 256, 512],
                 aspect_ratios=[0.5, 1.0, 2.0],
@@ -64,17 +64,21 @@ class AnchorGenerator(object):
            self.generate_cell_anchors(s, a)
            for s, a in zip(sizes, aspect_ratios)
        ]
+        [
+            self.register_buffer(
+                t.name, t, persistable=False) for t in cell_anchors
+        ]
        return cell_anchors

    def _create_grid_offsets(self, size, stride, offset):
-        grid_height, grid_width = size
+        grid_height, grid_width = size[0], size[1]
        shifts_x = paddle.arange(
            offset * stride, grid_width * stride, step=stride, dtype='float32')
        shifts_y = paddle.arange(
            offset * stride, grid_height * stride, step=stride, dtype='float32')
        shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x)
-        shift_x = shift_x.reshape([-1])
-        shift_y = shift_y.reshape([-1])
+        shift_x = paddle.reshape(shift_x, [-1])
+        shift_y = paddle.reshape(shift_y, [-1])
        return shift_x, shift_y

    def _grid_anchors(self, grid_sizes):
@@ -84,14 +88,15 @@ class AnchorGenerator(object):
            shift_x, shift_y = self._create_grid_offsets(size, stride,
                                                         self.offset)
            shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1)
+            shifts = paddle.reshape(shifts, [-1, 1, 4])
+            base_anchors = paddle.reshape(base_anchors, [1, -1, 4])

-            anchors.append((shifts.reshape([-1, 1, 4]) + base_anchors.reshape(
-                [1, -1, 4])).reshape([-1, 4]))
+            anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4]))

        return anchors

-    def __call__(self, input):
-        grid_sizes = [feature_map.shape[-2:] for feature_map in input]
+    def forward(self, input):
+        grid_sizes = [paddle.shape(feature_map)[-2:] for feature_map in input]
        anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
        return anchors_over_all_feature_maps

@@ -105,4 +110,4 @@ class AnchorGenerator(object):
                ratios and 5 sizes, the number of anchors is 15.
                For FPN models, `num_anchors` on every feature map is the same.
        """
-        return self.cell_anchors[0].shape[0]
+        return len(self.cell_anchors[0])
--- a/dygraph/ppdet/modeling/proposal_generator/rpn_head.py
+++ b/dygraph/ppdet/modeling/proposal_generator/rpn_head.py
@@ -108,7 +108,14 @@ class RPNHead(nn.Layer):

        anchors = self.anchor_generator(rpn_feats)

-        rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs)
+        # TODO: Fix batch_size > 1 when testing.
+        if self.training:
+            batch_size = im_shape.shape[0]
+        else:
+            batch_size = 1
+
+        rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs,
+                                            batch_size)

        if self.training:
            loss = self.get_loss(scores, deltas, anchors, inputs)
@@ -116,16 +123,15 @@ class RPNHead(nn.Layer):
        else:
            return rois, rois_num, None

-    def _gen_proposal(self, scores, bbox_deltas, anchors, inputs):
+    def _gen_proposal(self, scores, bbox_deltas, anchors, inputs, batch_size):
        """
-        scores (list[Tensor]): Multi-level scores prediction 
+        scores (list[Tensor]): Multi-level scores prediction
        bbox_deltas (list[Tensor]): Multi-level deltas prediction
-        anchors (list[Tensor]): Multi-level anchors 
+        anchors (list[Tensor]): Multi-level anchors
        inputs (dict): ground truth info
        """
        prop_gen = self.train_proposal if self.training else self.test_proposal
        im_shape = inputs['im_shape']
-        batch_size = im_shape.shape[0]
        rpn_rois_list = [[] for i in range(batch_size)]
        rpn_prob_list = [[] for i in range(batch_size)]
        rpn_rois_num_list = [[] for i in range(batch_size)]

--- a/dygraph/ppdet/py_op/post_process.py
+++ b/dygraph/ppdet/py_op/post_process.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import six
 import os
 import numpy as np
 import cv2


-def get_det_res(bboxes,
-                scores,
-                labels,
-                bbox_nums,
-                image_id,
-                label_to_cat_id_map,
-                bias=0):
+def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0):
    det_res = []
    k = 0
    for i in range(len(bbox_nums)):
        cur_image_id = int(image_id[i][0])
        det_nums = bbox_nums[i]
        for j in range(det_nums):
-            box = bboxes[k]
-            score = float(scores[k])
-            label = int(labels[k])
-            if label < 0: continue
+            dt = bboxes[k]
            k = k + 1
-            xmin, ymin, xmax, ymax = box.tolist()
-            category_id = label_to_cat_id_map[label]
+            num_id, score, xmin, ymin, xmax, ymax = dt.tolist()
+            if int(num_id) < 0:
+                continue
+            category_id = label_to_cat_id_map[int(num_id)]
            w = xmax - xmin + bias
            h = ymax - ymin + bias
            bbox = [xmin, ymin, w, h]
@@ -37,8 +43,7 @@ def get_det_res(bboxes,
    return det_res


-def get_seg_res(masks, scores, labels, mask_nums, image_id,
-                label_to_cat_id_map):
+def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map):
    import pycocotools.mask as mask_util
    seg_res = []
    k = 0
@@ -46,9 +51,9 @@ def get_seg_res(masks, scores, labels, mask_nums, image_id,
        cur_image_id = int(image_id[i][0])
        det_nums = mask_nums[i]
        for j in range(det_nums):
-            mask = masks[k]
-            score = float(scores[k])
-            label = int(labels[k])
+            mask = masks[k].astype(np.uint8)
+            score = float(bboxes[k][1])
+            label = int(bboxes[k][0])
            k = k + 1
            cat_id = label_to_cat_id_map[label]
            rle = mask_util.encode(