Unverified · commit 944eee5c · authored by: W wangguanzhong · committed by: GitHub

refactor rcnn, test=dygraph (#2114)

* refactor rcnn, test=dygraph

* add mask_rcnn, test=dygraph

* add Faster RCNN & Faster FPN, test=dygraph

* update according to the review, test=dygraph
Parent: 9a2651fe
 worker_num: 2
 
 TrainReader:
   sample_transforms:
-  - DecodeOp: { }
-  - RandomFlipImage: {prob: 0.5}
-  - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true}
-  - Permute: {to_bgr: false, channel_first: true}
+  - DecodeOp: {}
+  - RandomResizeOp: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
+  - RandomFlipOp: {prob: 0.5}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatch: {pad_to_stride: 32, use_padded_im_info: false, pad_gt: true}
+  - PadBatchOp: {pad_to_stride: 32, pad_gt: true}
   batch_size: 1
   shuffle: true
   drop_last: true

@@ -15,12 +15,12 @@ TrainReader:
 EvalReader:
   sample_transforms:
-  - DecodeOp: { }
-  - NormalizeImageOp: { is_scale: true, mean: [ 0.485,0.456,0.406 ], std: [ 0.229, 0.224,0.225 ] }
-  - ResizeOp: { interp: 1, target_size: [ 800, 1333 ], keep_ratio: True }
-  - PermuteOp: { }
+  - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatchOp: { pad_to_stride: 32, pad_gt: false }
+  - PadBatchOp: {pad_to_stride: 32, pad_gt: false}
   batch_size: 1
   shuffle: false
   drop_last: false

@@ -29,12 +29,12 @@ EvalReader:
 TestReader:
   sample_transforms:
-  - DecodeOp: { }
-  - NormalizeImageOp: { is_scale: true, mean: [ 0.485,0.456,0.406 ], std: [ 0.229, 0.224,0.225 ] }
-  - ResizeOp: { interp: 1, target_size: [ 800, 1333 ], keep_ratio: True }
-  - PermuteOp: { }
+  - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatchOp: { pad_to_stride: 32, pad_gt: false }
+  - PadBatchOp: {pad_to_stride: 32, pad_gt: false}
   batch_size: 1
   shuffle: false
   drop_last: false
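The new `ResizeOp` entries above use `keep_ratio: True` with `target_size: [800, 1333]`, i.e. a short-side/long-side rule rather than a fixed resize. A small sketch of that rule under the usual interpretation (illustrative names, not the PaddleDetection API):

```python
# Sketch of the keep_ratio resize rule implied by
# ResizeOp: {target_size: [800, 1333], keep_ratio: True}.
# The image is scaled so the short side reaches 800, but the scale is
# capped so the long side never exceeds 1333.
def keep_ratio_scale(im_h, im_w, short=800, long=1333):
    scale = min(short / min(im_h, im_w), long / max(im_h, im_w))
    return round(im_h * scale), round(im_w * scale)

print(keep_ratio_scale(480, 640))   # -> (800, 1067): short side hits 800
print(keep_ratio_scale(400, 1000))  # -> (533, 1333): capped by the long side
```

The multi-scale `RandomResizeOp` in TrainReader applies the same rule, picking one of the listed `[short, long]` pairs per sample.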
@@ -2,12 +2,7 @@ architecture: FasterRCNN
 pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar
 load_static_weights: True
-# Model Achitecture
 FasterRCNN:
-  # model anchor info flow
-  anchor: Anchor
-  proposal: Proposal
-  # model feat info flow
   backbone: ResNet
   rpn_head: RPNHead
   bbox_head: BBoxHead

@@ -24,70 +19,50 @@ ResNet:
   num_stages: 3
 
 RPNHead:
-  rpn_feat:
-    name: RPNFeat
-    feat_in: 1024
-    feat_out: 1024
-  anchor_per_position: 15
-  rpn_channel: 1024
-
-Anchor:
   anchor_generator:
-    name: AnchorGeneratorRPN
-    anchor_sizes: [32, 64, 128, 256, 512]
     aspect_ratios: [0.5, 1.0, 2.0]
-    stride: [16.0, 16.0]
-    variance: [1.0, 1.0, 1.0, 1.0]
-  anchor_target_generator:
-    name: AnchorTargetGeneratorRPN
+    anchor_sizes: [32, 64, 128, 256, 512]
+    strides: [16]
+  rpn_target_assign:
     batch_size_per_im: 256
     fg_fraction: 0.5
     negative_overlap: 0.3
     positive_overlap: 0.7
-    straddle_thresh: 0.0
-
-Proposal:
-  proposal_generator:
-    name: ProposalGenerator
-    min_size: 0.0
-    nms_thresh: 0.7
-    train_pre_nms_top_n: 12000
-    train_post_nms_top_n: 2000
-    infer_pre_nms_top_n: 6000
-    infer_post_nms_top_n: 1000
-  proposal_target_generator:
-    name: ProposalTargetGenerator
-    batch_size_per_im: 512
-    bbox_reg_weights: [0.1, 0.1, 0.2, 0.2]
-    bg_thresh_hi: [0.5,]
-    bg_thresh_lo: [0.0,]
-    fg_thresh: [0.5,]
-    fg_fraction: 0.25
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 12000
+    post_nms_top_n: 2000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 6000
+    post_nms_top_n: 1000
 
 BBoxHead:
-  bbox_feat:
-    name: BBoxFeat
-    roi_extractor:
-      name: RoIAlign
-      resolution: 14
-      sampling_ratio: 0
-      start_level: 0
-      end_level: 0
-  head_feat:
-    name: Res5Head
-    feat_in: 1024
-    feat_out: 512
+  head: Res5Head
+  roi_extractor:
+    resolution: 14
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: BBoxAssigner
   with_pool: true
-  in_feat: 2048
+
+BBoxAssigner:
+  batch_size_per_im: 512
+  bg_thresh: [0.5,]
+  fg_thresh: [0.5,]
+  fg_fraction: 0.25
+  use_random: True
 
 BBoxPostProcess:
-  decode:
-    name: RCNNBox
-    num_classes: 81
-    batch_size: 1
+  decode: RCNNBox
   nms:
     name: MultiClassNMS
     keep_top_k: 100
     score_threshold: 0.05
     nms_threshold: 0.5
-    normalized: true
@@ -2,12 +2,7 @@ architecture: FasterRCNN
 pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar
 load_static_weights: True
-# Model Achitecture
 FasterRCNN:
-  # model anchor info flow
-  anchor: Anchor
-  proposal: Proposal
-  # model feat info flow
   backbone: ResNet
   neck: FPN
   rpn_head: RPNHead

@@ -25,72 +20,56 @@ ResNet:
   num_stages: 4
 
 FPN:
-  in_channels: [256, 512, 1024, 2048]
   out_channel: 256
-  min_level: 0
-  max_level: 4
-  spatial_scale: [0.25, 0.125, 0.0625, 0.03125]
 
 RPNHead:
-  rpn_feat:
-    name: RPNFeat
-    feat_in: 256
-    feat_out: 256
-  anchor_per_position: 3
-  rpn_channel: 256
-
-Anchor:
   anchor_generator:
-    name: AnchorGeneratorRPN
     aspect_ratios: [0.5, 1.0, 2.0]
-    anchor_start_size: 32
-    stride: [4., 4.]
-  anchor_target_generator:
-    name: AnchorTargetGeneratorRPN
+    anchor_sizes: [[32], [64], [128], [256], [512]]
+    strides: [4, 8, 16, 32, 64]
+  rpn_target_assign:
     batch_size_per_im: 256
     fg_fraction: 0.5
     negative_overlap: 0.3
    positive_overlap: 0.7
-    straddle_thresh: 0.0
-
-Proposal:
-  proposal_generator:
-    name: ProposalGenerator
-    min_size: 0.0
-    nms_thresh: 0.7
-    train_pre_nms_top_n: 2000
-    train_post_nms_top_n: 2000
-    infer_pre_nms_top_n: 1000
-    infer_post_nms_top_n: 1000
-  proposal_target_generator:
-    name: ProposalTargetGenerator
-    batch_size_per_im: 512
-    bbox_reg_weights: [0.1, 0.1, 0.2, 0.2]
-    bg_thresh_hi: [0.5,]
-    bg_thresh_lo: [0.0,]
-    fg_thresh: [0.5,]
-    fg_fraction: 0.25
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 2000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
 
 BBoxHead:
-  bbox_feat:
-    name: BBoxFeat
-    roi_extractor:
-      name: RoIAlign
-      resolution: 7
-      sampling_ratio: 2
-  head_feat:
-    name: TwoFCHead
-    in_dim: 256
+  head: TwoFCHead
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: BBoxAssigner
+
+BBoxAssigner:
+  batch_size_per_im: 512
+  bg_thresh: [0.5,]
+  fg_thresh: [0.5,]
+  fg_fraction: 0.25
+  use_random: True
 
 TwoFCHead:
   mlp_dim: 1024
-  in_feat: 1024
 
 BBoxPostProcess:
-  decode:
-    name: RCNNBox
-    num_classes: 81
-    batch_size: 1
+  decode: RCNNBox
   nms:
     name: MultiClassNMS
     keep_top_k: 100
     score_threshold: 0.05
     nms_threshold: 0.5
-    normalized: true
 worker_num: 2
 
 TrainReader:
   sample_transforms:
-  - DecodeOp: { }
-  - RandomFlipImage: {prob: 0.5}
-  - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true}
-  - Permute: {to_bgr: false, channel_first: true}
+  - DecodeOp: {}
+  - RandomResizeOp: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
+  - RandomFlipOp: {prob: 0.5}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatch: {pad_to_stride: -1, use_padded_im_info: false, pad_gt: true}
+  - PadBatchOp: {pad_to_stride: -1., pad_gt: true}
   batch_size: 1
   shuffle: true
   drop_last: true

@@ -15,12 +15,12 @@ TrainReader:
 EvalReader:
   sample_transforms:
-  - DecodeOp: { }
-  - NormalizeImageOp: { is_scale: true, mean: [ 0.485,0.456,0.406 ], std: [ 0.229, 0.224,0.225 ] }
-  - ResizeOp: { interp: 1, target_size: [ 800, 1333 ], keep_ratio: True }
-  - PermuteOp: { }
+  - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatchOp: { pad_to_stride: -1, pad_gt: false }
+  - PadBatchOp: {pad_to_stride: -1., pad_gt: false}
   batch_size: 1
   shuffle: false
   drop_last: false

@@ -29,12 +29,12 @@ EvalReader:
 TestReader:
   sample_transforms:
-  - DecodeOp: { }
-  - NormalizeImageOp: { is_scale: true, mean: [ 0.485,0.456,0.406 ], std: [ 0.229, 0.224,0.225 ] }
-  - ResizeOp: { interp: 1, target_size: [ 800, 1333 ], keep_ratio: True }
-  - PermuteOp: { }
+  - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatchOp: { pad_to_stride: -1, pad_gt: false }
+  - PadBatchOp: {pad_to_stride: -1., pad_gt: false}
   batch_size: 1
   shuffle: false
   drop_last: false
@@ -7,8 +7,8 @@ LearningRate:
     gamma: 0.1
     milestones: [8, 11]
   - !LinearWarmup
-    start_factor: 0.3333333333333333
-    steps: 500
+    start_factor: 0.1
+    steps: 1000
 
 OptimizerBuilder:
   optimizer:
...
@@ -2,22 +2,21 @@ worker_num: 2
 TrainReader:
   sample_transforms:
   - DecodeOp: {}
-  - RandomFlipImage: {prob: 0.5, is_mask_flip: true}
-  - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true}
-  - Permute: {to_bgr: false, channel_first: true}
+  - RandomResizeOp: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
+  - RandomFlipOp: {prob: 0.5, is_mask_flip: true}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatch: {pad_to_stride: 32, use_padded_im_info: false, pad_gt: true}
+  - PadBatchOp: {pad_to_stride: 32, pad_gt: true}
   batch_size: 1
   shuffle: true
   drop_last: true
 
 EvalReader:
   sample_transforms:
   - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
   - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeOp: {interp: 1, target_size: [800, 1333], keep_ratio: True}
   - PermuteOp: {}
   batch_transforms:
   - PadBatchOp: {pad_to_stride: 32, pad_gt: false}

@@ -30,8 +29,8 @@ EvalReader:
 TestReader:
   sample_transforms:
   - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
   - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeOp: {interp: 1, target_size: [800, 1333], keep_ratio: True}
   - PermuteOp: {}
   batch_transforms:
   - PadBatchOp: {pad_to_stride: 32, pad_gt: false}
...
@@ -2,13 +2,7 @@ architecture: MaskRCNN
 pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar
 load_static_weights: True
-# Model Achitecture
 MaskRCNN:
-  # model anchor info flow
-  anchor: Anchor
-  proposal: Proposal
-  mask: Mask
-  # model feat info flow
   backbone: ResNet
   rpn_head: RPNHead
   bbox_head: BBoxHead

@@ -26,88 +20,69 @@ ResNet:
   num_stages: 3
 
 RPNHead:
-  rpn_feat:
-    name: RPNFeat
-    feat_in: 1024
-    feat_out: 1024
-  anchor_per_position: 15
-
-Anchor:
   anchor_generator:
-    name: AnchorGeneratorRPN
-    anchor_sizes: [32, 64, 128, 256, 512]
     aspect_ratios: [0.5, 1.0, 2.0]
-    stride: [16.0, 16.0]
-    variance: [1.0, 1.0, 1.0, 1.0]
-  anchor_target_generator:
-    name: AnchorTargetGeneratorRPN
+    anchor_sizes: [32, 64, 128, 256, 512]
+    strides: [16]
+  rpn_target_assign:
     batch_size_per_im: 256
     fg_fraction: 0.5
     negative_overlap: 0.3
     positive_overlap: 0.7
-    straddle_thresh: 0.0
-
-Proposal:
-  proposal_generator:
-    name: ProposalGenerator
-    min_size: 0.0
-    nms_thresh: 0.7
-    train_pre_nms_top_n: 12000
-    train_post_nms_top_n: 2000
-    infer_pre_nms_top_n: 6000
-    infer_post_nms_top_n: 1000
-  proposal_target_generator:
-    name: ProposalTargetGenerator
-    batch_size_per_im: 512
-    bbox_reg_weights: [0.1, 0.1, 0.2, 0.2]
-    bg_thresh_hi: [0.5,]
-    bg_thresh_lo: [0.0,]
-    fg_thresh: [0.5,]
-    fg_fraction: 0.25
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 12000
+    post_nms_top_n: 2000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 6000
+    post_nms_top_n: 1000
 
 BBoxHead:
-  bbox_feat:
-    name: BBoxFeat
-    roi_extractor: RoIAlign
-  head_feat:
-    name: Res5Head
-    feat_in: 1024
-    feat_out: 512
+  head: Res5Head
+  roi_extractor:
+    resolution: 14
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: BBoxAssigner
   with_pool: true
-  in_feat: 2048
+
+BBoxAssigner:
+  batch_size_per_im: 512
+  bg_thresh: [0.5,]
+  fg_thresh: [0.5,]
+  fg_fraction: 0.25
+  use_random: True
 
 BBoxPostProcess:
-  decode:
-    name: RCNNBox
-    num_classes: 81
-    batch_size: 1
+  decode: RCNNBox
   nms:
     name: MultiClassNMS
     keep_top_k: 100
     score_threshold: 0.05
     nms_threshold: 0.5
-    normalized: true
 
-Mask:
-  mask_target_generator:
-    name: MaskTargetGenerator
-    mask_resolution: 14
-
-RoIAlign:
-  resolution: 14
-  sampling_ratio: 0
-  start_level: 0
-  end_level: 0
-
 MaskHead:
-  mask_feat:
-    name: MaskFeat
-    num_convs: 0
-    feat_in: 2048
-    feat_out: 256
-  mask_roi_extractor: RoIAlign
+  head: MaskFeat
+  roi_extractor:
+    resolution: 14
+    sampling_ratio: 0
+    aligned: True
+  mask_assigner: MaskAssigner
   share_bbox_feat: true
-  feat_in: 256
+
+MaskFeat:
+  out_channels: 256
 
-MaskPostProcess:
+MaskAssigner:
   mask_resolution: 14
+
+MaskPostProcess:
+  binary_thresh: 0.5
@@ -2,13 +2,7 @@ architecture: MaskRCNN
 pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar
 load_static_weights: True
-# Model Achitecture
 MaskRCNN:
-  # model anchor info flow
-  anchor: Anchor
-  proposal: Proposal
-  mask: Mask
-  # model feat info flow
   backbone: ResNet
   neck: FPN
   rpn_head: RPNHead

@@ -27,94 +21,73 @@ ResNet:
   num_stages: 4
 
 FPN:
-  in_channels: [256, 512, 1024, 2048]
   out_channel: 256
-  min_level: 0
-  max_level: 4
-  spatial_scale: [0.25, 0.125, 0.0625, 0.03125]
 
 RPNHead:
-  rpn_feat:
-    name: RPNFeat
-    feat_in: 256
-    feat_out: 256
-  anchor_per_position: 3
-  rpn_channel: 256
-
-Anchor:
   anchor_generator:
-    name: AnchorGeneratorRPN
     aspect_ratios: [0.5, 1.0, 2.0]
-    anchor_start_size: 32
-    stride: [4., 4.]
-  anchor_target_generator:
-    name: AnchorTargetGeneratorRPN
+    anchor_sizes: [[32], [64], [128], [256], [512]]
+    strides: [4, 8, 16, 32, 64]
+  rpn_target_assign:
     batch_size_per_im: 256
     fg_fraction: 0.5
     negative_overlap: 0.3
     positive_overlap: 0.7
-    straddle_thresh: 0.0
-
-Proposal:
-  proposal_generator:
-    name: ProposalGenerator
-    min_size: 0.0
-    nms_thresh: 0.7
-    train_pre_nms_top_n: 2000
-    train_post_nms_top_n: 2000
-    infer_pre_nms_top_n: 1000
-    infer_post_nms_top_n: 1000
-  proposal_target_generator:
-    name: ProposalTargetGenerator
-    batch_size_per_im: 512
-    bbox_reg_weights: [0.1, 0.1, 0.2, 0.2]
-    bg_thresh_hi: [0.5,]
-    bg_thresh_lo: [0.0,]
-    fg_thresh: [0.5,]
-    fg_fraction: 0.25
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 2000
+    post_nms_top_n: 1000
+    topk_after_collect: True
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
 
 BBoxHead:
-  bbox_feat:
-    name: BBoxFeat
-    roi_extractor:
-      name: RoIAlign
-      resolution: 7
-      sampling_ratio: 2
-  head_feat:
-    name: TwoFCHead
-    in_dim: 256
+  head: TwoFCHead
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: BBoxAssigner
+
+BBoxAssigner:
+  batch_size_per_im: 512
+  bg_thresh: [0.5,]
+  fg_thresh: [0.5,]
+  fg_fraction: 0.25
+  use_random: True
 
 TwoFCHead:
   mlp_dim: 1024
-  in_feat: 1024
 
 BBoxPostProcess:
-  decode:
-    name: RCNNBox
-    num_classes: 81
-    batch_size: 1
+  decode: RCNNBox
   nms:
     name: MultiClassNMS
     keep_top_k: 100
     score_threshold: 0.05
     nms_threshold: 0.5
-    normalized: true
 
-Mask:
-  mask_target_generator:
-    name: MaskTargetGenerator
-    mask_resolution: 28
-
 MaskHead:
-  mask_feat:
-    name: MaskFeat
-    num_convs: 4
-    feat_in: 256
-    feat_out: 256
-  mask_roi_extractor:
-    name: RoIAlign
+  head: MaskFeat
+  roi_extractor:
     resolution: 14
-    sampling_ratio: 2
+    sampling_ratio: 0
+    aligned: True
+  mask_assigner: MaskAssigner
   share_bbox_feat: False
-  feat_in: 256
+
+MaskFeat:
+  num_convs: 4
+  out_channels: 256
 
-MaskPostProcess:
+MaskAssigner:
   mask_resolution: 28
+
+MaskPostProcess:
+  binary_thresh: 0.5
@@ -2,12 +2,12 @@ worker_num: 2
 TrainReader:
   sample_transforms:
   - DecodeOp: {}
-  - RandomFlipImage: {prob: 0.5, is_mask_flip: true}
-  - NormalizeImage: {is_channel_first: false, is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeImage: {target_size: 800, max_size: 1333, interp: 1, use_cv2: true}
-  - Permute: {to_bgr: false, channel_first: true}
+  - RandomResizeOp: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
+  - RandomFlipOp: {prob: 0.5, is_mask_flip: true}
+  - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - PermuteOp: {}
   batch_transforms:
-  - PadBatch: {pad_to_stride: -1., use_padded_im_info: false, pad_gt: true}
+  - PadBatchOp: {pad_to_stride: -1., pad_gt: true}
   batch_size: 1
   shuffle: true
   drop_last: true

@@ -16,8 +16,8 @@ TrainReader:
 EvalReader:
   sample_transforms:
   - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
   - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeOp: {interp: 1, target_size: [800, 1333], keep_ratio: True}
   - PermuteOp: {}
   batch_transforms:
   - PadBatchOp: {pad_to_stride: -1., pad_gt: false}

@@ -30,8 +30,8 @@ EvalReader:
 TestReader:
   sample_transforms:
   - DecodeOp: {}
+  - ResizeOp: {interp: 2, target_size: [800, 1333], keep_ratio: True}
   - NormalizeImageOp: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
-  - ResizeOp: {interp: 1, target_size: [800, 1333], keep_ratio: True}
   - PermuteOp: {}
   batch_transforms:
   - PadBatchOp: {pad_to_stride: -1., pad_gt: false}
...
@@ -7,8 +7,8 @@ LearningRate:
     gamma: 0.1
     milestones: [8, 11]
   - !LinearWarmup
-    start_factor: 0.3333333333333333
-    steps: 500
+    start_factor: 0.001
+    steps: 1000
 
 OptimizerBuilder:
   optimizer:
...
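The warmup changes above replace a 1/3 start factor over 500 steps with a much lower start factor over 1000 steps. A sketch of how `!LinearWarmup` with `start_factor` and `steps` is commonly interpreted (an assumption about the scheduler, not a reading of Paddle's implementation):

```python
# Hedged sketch of a linear warmup: the learning rate ramps linearly
# from base_lr * start_factor up to base_lr over `steps` iterations,
# then hands off to the main (piecewise-decay) schedule.
def warmup_lr(base_lr, step, start_factor=0.001, steps=1000):
    if step >= steps:
        return base_lr
    alpha = step / steps
    return base_lr * (start_factor * (1 - alpha) + alpha)

print(warmup_lr(0.01, 0))     # 1e-05, i.e. base_lr * start_factor
print(warmup_lr(0.01, 1000))  # 0.01, warmup complete
```

Under this reading, the new settings start training at a far smaller learning rate and take twice as many iterations to reach the base rate, which is a common stabilizer for RCNN-style models.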
@@ -76,13 +76,6 @@ class ConfigPaser {
         std::cerr << "Please set draw_threshold." << std::endl;
         return false;
       }
-      // Get with_background
-      if (config["with_background"].IsDefined()) {
-        with_background_ = config["with_background"].as<bool>();
-      } else {
-        std::cerr << "Please set with_background." << std::endl;
-        return false;
-      }
       // Get Preprocess for preprocessing
       if (config["Preprocess"].IsDefined()) {
         preprocess_info_ = config["Preprocess"];

@@ -111,7 +104,6 @@ class ConfigPaser {
   float draw_threshold_;
   std::string arch_;
   int min_subgraph_size_;
-  bool with_background_;
   YAML::Node preprocess_info_;
   std::vector<std::string> label_list_;
   std::vector<int> image_shape_;
...
@@ -99,19 +99,6 @@ def _load_config_with_base(file_path):
     return file_cfg
 
-WITHOUT_BACKGROUND_ARCHS = ['YOLOv3', 'FCOS', 'TTFNet']
-
-def _parse_with_background():
-    arch = global_config.architecture
-    with_background = arch not in WITHOUT_BACKGROUND_ARCHS
-    global_config['with_background'] = with_background
-    global_config['TrainReader']['with_background'] = with_background
-    global_config['EvalReader']['with_background'] = with_background
-    global_config['TestReader']['with_background'] = with_background
-    global_config['num_classes'] += with_background
-
 def load_config(file_path):
     """
     Load config from file.

@@ -129,9 +116,6 @@ def load_config(file_path):
     cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0]
     merge_config(cfg)
 
-    # parse config from merged config
-    _parse_with_background()
-
     return global_config

@@ -166,7 +150,7 @@ def merge_config(config, another_cfg=None):
     Returns: global config
     """
     global global_config
-    dct = another_cfg if another_cfg is not None else global_config
+    dct = another_cfg or global_config
     return dict_merge(dct, config)
@@ -231,16 +215,13 @@ def create(cls_or_name, **kwargs):
         isinstance(global_config[name], SchemaDict), \
         "the module {} is not registered".format(name)
     config = global_config[name]
-    config.update(kwargs)
-    config.validate()
     cls = getattr(config.pymodule, name)
-    kwargs = {}
-    kwargs.update(global_config[name])
+    cls_kwargs = {}
+    cls_kwargs.update(global_config[name])
 
     # parse `shared` annoation of registered modules
     if getattr(config, 'shared', None):
         for k in config.shared:
             target_key = config[k]
             shared_conf = config.schema[k].default
             assert isinstance(shared_conf, SharedConfig)

@@ -249,11 +230,14 @@ def create(cls_or_name, **kwargs):
                 continue  # value is given for the module
             elif shared_conf.key in global_config:
                 # `key` is present in config
-                kwargs[k] = global_config[shared_conf.key]
+                cls_kwargs[k] = global_config[shared_conf.key]
             else:
-                kwargs[k] = shared_conf.default_value
+                cls_kwargs[k] = shared_conf.default_value
 
+    if getattr(cls, 'from_config', None):
+        cls_kwargs.update(cls.from_config(config, **kwargs))
+
     # parse `inject` annoation of registered modules
     if getattr(config, 'inject', None):
         for k in config.inject:
             target_key = config[k]

@@ -275,18 +259,18 @@ def create(cls_or_name, **kwargs):
                         continue
                     target[i] = v
                 if isinstance(target, SchemaDict):
-                    kwargs[k] = create(inject_name)
+                    cls_kwargs[k] = create(inject_name)
             elif isinstance(target_key, str):
                 if target_key not in global_config:
                     raise ValueError("Missing injection config:", target_key)
                 target = global_config[target_key]
                 if isinstance(target, SchemaDict):
-                    kwargs[k] = create(target_key)
+                    cls_kwargs[k] = create(target_key)
                 elif hasattr(target, '__dict__'):  # serialized object
-                    kwargs[k] = target
+                    cls_kwargs[k] = target
             else:
                 raise ValueError("Unsupported injection type:", target_key)
     # prevent modification of global config values of reference types
     # (e.g., list, dict) from within the created module instances
     #kwargs = copy.deepcopy(kwargs)
-    return cls(**kwargs)
+    return cls(**cls_kwargs)
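The `create()` refactor separates the caller's `**kwargs` from the constructor kwargs (`cls_kwargs`) and adds a `from_config` hook so a registered class can derive extra constructor arguments itself. A minimal standalone sketch of that pattern (simplified; not the actual `ppdet.core.workspace` code):

```python
# Minimal sketch of the registry-create pattern: look up a class by name
# in a global config, copy its config entries as constructor kwargs, and
# let an optional from_config() classmethod derive extra kwargs from the
# caller's context. All names here are illustrative.
global_config = {
    'RoIAlign': {'resolution': 7, 'sampling_ratio': 0},
}

class RoIAlign:
    def __init__(self, resolution, sampling_ratio, spatial_scale=1.0):
        self.resolution = resolution
        self.sampling_ratio = sampling_ratio
        self.spatial_scale = spatial_scale

    @classmethod
    def from_config(cls, cfg, **kwargs):
        # Derive kwargs from the caller's context (here: a feature stride)
        # instead of requiring them to be spelled out in the YAML config.
        stride = kwargs.get('stride', 1)
        return {'spatial_scale': 1.0 / stride}

def create(name, **kwargs):
    cfg = global_config[name]
    cls = globals()[name]
    cls_kwargs = dict(cfg)  # copy so the global config is never mutated
    if getattr(cls, 'from_config', None):
        cls_kwargs.update(cls.from_config(cfg, **kwargs))
    return cls(**cls_kwargs)

roi = create('RoIAlign', stride=16)
print(roi.spatial_scale)  # 0.0625 = 1/16
```

Keeping `cls_kwargs` distinct from `kwargs` also means the caller's arguments no longer overwrite the registered config in place, which is what the removed `config.update(kwargs)` used to do.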
@@ -37,7 +37,7 @@ MAIN_PID = os.getpid()
 class Compose(object):
-    def __init__(self, transforms, num_classes=81):
+    def __init__(self, transforms, num_classes=80):
         self.transforms = transforms
         self.transforms_cls = []
         for t in self.transforms:

@@ -61,7 +61,7 @@ class Compose(object):
 class BatchCompose(Compose):
-    def __init__(self, transforms, num_classes=81):
+    def __init__(self, transforms, num_classes=80):
         super(BatchCompose, self).__init__(transforms, num_classes)
         self.output_fields = mp.Manager().list([])
         self.lock = mp.Lock()

@@ -119,8 +119,7 @@ class BaseDataLoader(object):
                  shuffle=False,
                  drop_last=False,
                  drop_empty=True,
-                 num_classes=81,
-                 with_background=True,
+                 num_classes=80,
                  **kwargs):
         # sample transform
         self._sample_transforms = Compose(

@@ -132,7 +131,6 @@ class BaseDataLoader(object):
         self.batch_size = batch_size
         self.shuffle = shuffle
         self.drop_last = drop_last
-        self.with_background = with_background
         self.kwargs = kwargs
 
     def __call__(self,

@@ -142,7 +140,7 @@ class BaseDataLoader(object):
                  return_list=False,
                  use_prefetch=True):
         self.dataset = dataset
-        self.dataset.parse_dataset(self.with_background)
+        self.dataset.parse_dataset()
         # get data
         self.dataset.set_transform(self._sample_transforms)
         # set kwargs

@@ -204,13 +202,11 @@ class TrainReader(BaseDataLoader):
                  shuffle=True,
                  drop_last=True,
                  drop_empty=True,
-                 num_classes=81,
-                 with_background=True,
+                 num_classes=80,
                  **kwargs):
-        super(TrainReader, self).__init__(inputs_def, sample_transforms,
-                                          batch_transforms, batch_size, shuffle,
-                                          drop_last, drop_empty, num_classes,
-                                          with_background, **kwargs)
+        super(TrainReader, self).__init__(
+            inputs_def, sample_transforms, batch_transforms, batch_size,
+            shuffle, drop_last, drop_empty, num_classes, **kwargs)
 
 @register

@@ -223,13 +219,11 @@ class EvalReader(BaseDataLoader):
                  shuffle=False,
                  drop_last=True,
                  drop_empty=True,
-                 num_classes=81,
-                 with_background=True,
+                 num_classes=80,
                  **kwargs):
-        super(EvalReader, self).__init__(inputs_def, sample_transforms,
-                                         batch_transforms, batch_size, shuffle,
-                                         drop_last, drop_empty, num_classes,
-                                         with_background, **kwargs)
+        super(EvalReader, self).__init__(
+            inputs_def, sample_transforms, batch_transforms, batch_size,
+            shuffle, drop_last, drop_empty, num_classes, **kwargs)
 
 @register

@@ -242,10 +236,8 @@ class TestReader(BaseDataLoader):
                  shuffle=False,
                  drop_last=False,
                  drop_empty=True,
-                 num_classes=81,
-                 with_background=True,
+                 num_classes=80,
                  **kwargs):
-        super(TestReader, self).__init__(inputs_def, sample_transforms,
-                                         batch_transforms, batch_size, shuffle,
-                                         drop_last, drop_empty, num_classes,
-                                         with_background, **kwargs)
+        super(TestReader, self).__init__(
+            inputs_def, sample_transforms, batch_transforms, batch_size,
+            shuffle, drop_last, drop_empty, num_classes, **kwargs)
@@ -35,7 +35,7 @@ class COCODataSet(DetDataset):
         self.load_image_only = False
         self.load_semantic = False

-    def parse_dataset(self, with_background=True):
+    def parse_dataset(self):
         anno_path = os.path.join(self.dataset_dir, self.anno_path)
         image_dir = os.path.join(self.dataset_dir, self.image_dir)
@@ -44,16 +44,12 @@ class COCODataSet(DetDataset):
         from pycocotools.coco import COCO
         coco = COCO(anno_path)
         img_ids = coco.getImgIds()
+        img_ids.sort()
         cat_ids = coco.getCatIds()
         records = []
         ct = 0

-        # when with_background = True, mapping category to classid, like:
-        #   background:0, first_class:1, second_class:2, ...
-        catid2clsid = dict({
-            catid: i + int(with_background)
-            for i, catid in enumerate(cat_ids)
-        })
+        catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
         cname2cid = dict({
             coco.loadCats(catid)[0]['name']: clsid
             for catid, clsid in catid2clsid.items()
@@ -95,13 +91,14 @@ class COCODataSet(DetDataset):
                 else:
                     if not any(np.array(inst['bbox'])):
                         continue
-                    x, y, box_w, box_h = inst['bbox']
-                    x1 = max(0, x)
-                    y1 = max(0, y)
-                    x2 = min(im_w - 1, x1 + max(0, box_w - 1))
-                    y2 = min(im_h - 1, y1 + max(0, box_h - 1))
-                    if inst['area'] > 0 and x2 >= x1 and y2 >= y1:
-                        inst['clean_bbox'] = [x1, y1, x2, y2]
+                    x1, y1, box_w, box_h = inst['bbox']
+                    x2 = x1 + box_w
+                    y2 = y1 + box_h
+                    eps = 1e-5
+                    if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
+                        inst['clean_bbox'] = [
+                            round(float(x), 3) for x in [x1, y1, x2, y2]
+                        ]
                         bboxes.append(inst)
                     else:
                         logger.warning(
...
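The refactored `parse_dataset` converts COCO `[x, y, w, h]` annotations to `[x1, y1, x2, y2]` directly, filtering degenerate boxes with an `eps` threshold instead of the old `-1`/clamping arithmetic. A minimal sketch of that conversion; the standalone `clean_bbox` function here is illustrative (the diff computes this inline):

```python
def clean_bbox(bbox, area, eps=1e-5):
    """Convert a COCO [x, y, w, h] box to [x1, y1, x2, y2].

    Returns None for degenerate boxes (non-positive area or a side not
    longer than eps), mirroring the filtering in the refactored loader.
    """
    x1, y1, box_w, box_h = bbox
    x2 = x1 + box_w
    y2 = y1 + box_h
    if area > 0 and x2 - x1 > eps and y2 - y1 > eps:
        # round to 3 decimals, as the refactored loader does
        return [round(float(v), 3) for v in [x1, y1, x2, y2]]
    return None
```

Note there is no clipping against the image size here: with continuous box coordinates, a box ending exactly at the image border is `[.., .., im_w, im_h]`, not `[.., .., im_w - 1, im_h - 1]`.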
@@ -78,7 +78,7 @@ class DetDataset(Dataset):
     def set_epoch(self, epoch_id):
         self._epoch = epoch_id

-    def parse_dataset(self, with_background=True):
+    def parse_dataset(self, ):
         raise NotImplemented(
             "Need to implement parse_dataset method of Dataset")
@@ -115,13 +115,17 @@ class ImageFolder(DetDataset):
                  sample_num=-1,
                  use_default_label=None,
                  **kwargs):
-        super(ImageFolder, self).__init__(dataset_dir, image_dir, anno_path,
-                                          sample_num, use_default_label)
+        super(ImageFolder, self).__init__(
+            dataset_dir,
+            image_dir,
+            anno_path,
+            sample_num=sample_num,
+            use_default_label=use_default_label)
         self._imid2path = {}
         self.roidbs = None
         self.sample_num = sample_num

-    def parse_dataset(self, with_background=True):
+    def parse_dataset(self, ):
         if not self.roidbs:
             self.roidbs = self._load_images()
...
@@ -58,14 +58,11 @@ class VOCDataSet(DetDataset):
             sample_num=sample_num)
         self.label_list = label_list

-    def parse_dataset(self, with_background=True):
+    def parse_dataset(self, ):
         anno_path = os.path.join(self.dataset_dir, self.anno_path)
         image_dir = os.path.join(self.dataset_dir, self.image_dir)

         # mapping category name to class id
-        # if with_background is True:
-        #   background:0, first_class:1, second_class:2, ...
-        # if with_background is False:
-        #   first_class:0, second_class:1, ...
+        # first_class:0, second_class:1, ...
         records = []
         ct = 0
@@ -76,12 +73,12 @@ class VOCDataSet(DetDataset):
                 raise ValueError("label_list {} does not exists".format(
                     label_path))
             with open(label_path, 'r') as fr:
-                label_id = int(with_background)
+                label_id = 0
                 for line in fr.readlines():
                     cname2cid[line.strip()] = label_id
                     label_id += 1
         else:
-            cname2cid = pascalvoc_label(with_background)
+            cname2cid = pascalvoc_label()

         with open(anno_path, 'r') as fr:
             while True:
@@ -175,29 +172,27 @@ class VOCDataSet(DetDataset):
         return os.path.join(self.dataset_dir, self.label_list)


-def pascalvoc_label(with_background=True):
+def pascalvoc_label():
     labels_map = {
-        'aeroplane': 1,
-        'bicycle': 2,
-        'bird': 3,
-        'boat': 4,
-        'bottle': 5,
-        'bus': 6,
-        'car': 7,
-        'cat': 8,
-        'chair': 9,
-        'cow': 10,
-        'diningtable': 11,
-        'dog': 12,
-        'horse': 13,
-        'motorbike': 14,
-        'person': 15,
-        'pottedplant': 16,
-        'sheep': 17,
-        'sofa': 18,
-        'train': 19,
-        'tvmonitor': 20
+        'aeroplane': 0,
+        'bicycle': 1,
+        'bird': 2,
+        'boat': 3,
+        'bottle': 4,
+        'bus': 5,
+        'car': 6,
+        'cat': 7,
+        'chair': 8,
+        'cow': 9,
+        'diningtable': 10,
+        'dog': 11,
+        'horse': 12,
+        'motorbike': 13,
+        'person': 14,
+        'pottedplant': 15,
+        'sheep': 16,
+        'sofa': 17,
+        'train': 18,
+        'tvmonitor': 19
     }
-    if not with_background:
-        labels_map = {k: v - 1 for k, v in labels_map.items()}
     return labels_map
@@ -52,7 +52,7 @@ class WIDERFaceDataSet(DataSet):
         self.cname2cid = None
         self.with_lmk = with_lmk

-    def load_roidb_and_cname2cid(self, with_background=True):
+    def load_roidb_and_cname2cid(self, ):
         anno_path = os.path.join(self.dataset_dir, self.anno_path)
         image_dir = os.path.join(self.dataset_dir, self.image_dir)
@@ -61,7 +61,7 @@ class WIDERFaceDataSet(DataSet):
         records = []
         ct = 0
         file_lists = self._load_file_list(txt_file)
-        cname2cid = widerface_label(with_background)
+        cname2cid = widerface_label()

         for item in file_lists:
             im_fname = item[0]
@@ -159,8 +159,6 @@ class WIDERFaceDataSet(DataSet):
         return list(file_dict.values())


-def widerface_label(with_background=True):
-    labels_map = {'face': 1}
-    if not with_background:
-        labels_map = {k: v - 1 for k, v in labels_map.items()}
+def widerface_label():
+    labels_map = {'face': 0}
     return labels_map
@@ -500,7 +500,7 @@ class RandomFlipOp(BaseOperator):
     def apply_segm(self, segms, height, width):
         def _flip_poly(poly, width):
             flipped_poly = np.array(poly)
-            flipped_poly[0::2] = width - np.array(poly[0::2]) - 1
+            flipped_poly[0::2] = width - np.array(poly[0::2])
             return flipped_poly.tolist()

         def _flip_rle(rle, height, width):
@@ -526,7 +526,7 @@ class RandomFlipOp(BaseOperator):
         for i in range(gt_keypoint.shape[1]):
             if i % 2 == 0:
                 old_x = gt_keypoint[:, i].copy()
-                gt_keypoint[:, i] = width - old_x - 1
+                gt_keypoint[:, i] = width - old_x
         return gt_keypoint

     def apply_image(self, image):
@@ -535,8 +535,8 @@ class RandomFlipOp(BaseOperator):
     def apply_bbox(self, bbox, width):
         oldx1 = bbox[:, 0].copy()
         oldx2 = bbox[:, 2].copy()
-        bbox[:, 0] = width - oldx2 - 1
-        bbox[:, 2] = width - oldx1 - 1
+        bbox[:, 0] = width - oldx2
+        bbox[:, 2] = width - oldx1
         return bbox

     def apply(self, sample, context=None):
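The flip changes above drop the trailing `- 1` everywhere: coordinates are now treated as continuous positions rather than discrete pixel indices, so a horizontal flip maps `x` to `width - x` and an involution property holds exactly (flipping twice restores the original box). A self-contained sketch of the box case, mirroring `apply_bbox`:

```python
import numpy as np

def flip_bbox(bbox, width):
    """Horizontally flip [x1, y1, x2, y2] boxes for an image of given width.

    Continuous-coordinate convention (no trailing -1): a box touching the
    right edge maps exactly onto the left edge, and x1 <= x2 is preserved
    because the old x2 becomes the new x1.
    """
    bbox = bbox.copy()
    oldx1 = bbox[:, 0].copy()
    oldx2 = bbox[:, 2].copy()
    bbox[:, 0] = width - oldx2
    bbox[:, 2] = width - oldx1
    return bbox
```

Under the old `width - x - 1` convention, flipping twice shifted boxes by two pixels; with this form, `flip_bbox(flip_bbox(b, w), w)` returns `b` unchanged.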
@@ -601,6 +601,7 @@ class ResizeOp(BaseOperator):
     def apply_image(self, image, scale):
         im_scale_x, im_scale_y = scale

         return cv2.resize(
             image,
             None,
@@ -614,8 +615,8 @@ class ResizeOp(BaseOperator):
         resize_w, resize_h = size
         bbox[:, 0::2] *= im_scale_x
         bbox[:, 1::2] *= im_scale_y
-        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w - 1)
-        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h - 1)
+        bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
+        bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
         return bbox

     def apply_segm(self, segms, im_size, scale):
...
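The `ResizeOp` change is the same convention fix applied to scaling: boxes are multiplied by the per-axis scale and then clipped to the resized image extent `(resize_w, resize_h)` rather than `size - 1`. A standalone sketch of `apply_bbox` (the free function here is illustrative):

```python
import numpy as np

def resize_bbox(bbox, scale, size):
    """Scale [x1, y1, x2, y2] boxes and clip them to the resized image.

    scale: (im_scale_x, im_scale_y) multipliers per axis.
    size:  (resize_w, resize_h) of the output image; the clip bound is
           the full extent, matching continuous box coordinates.
    """
    im_scale_x, im_scale_y = scale
    resize_w, resize_h = size
    bbox = bbox.copy()
    bbox[:, 0::2] *= im_scale_x  # x1, x2
    bbox[:, 1::2] *= im_scale_y  # y1, y2
    bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w)
    bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h)
    return bbox
```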
@@ -43,9 +43,8 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
     preprocess_list = []

     anno_file = dataset_cfg.get_anno()
-    with_background = reader_cfg['with_background']
-    clsid2catid, catid2name = get_categories(metric, anno_file, with_background)
+    clsid2catid, catid2name = get_categories(metric, anno_file)

     label_list = [str(cat) for cat in catid2name.values()]
@@ -73,7 +72,7 @@ def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape):
             })
             break

-    return with_background, preprocess_list, label_list, image_shape
+    return preprocess_list, label_list, image_shape


 def _dump_infer_config(config, path, image_shape, model):
@@ -102,7 +101,7 @@ def _dump_infer_config(config, path, image_shape, model):
     if 'mask_post_process' in model.__dict__ and model.__dict__[
             'mask_post_process']:
         infer_cfg['mask_resolution'] = model.mask_post_process.mask_resolution
-    infer_cfg['with_background'], infer_cfg['Preprocess'], infer_cfg[
+    infer_cfg['Preprocess'], infer_cfg[
         'label_list'], image_shape = _parse_reader(
             config['TestReader'], config['TestDataset'], config['metric'],
             infer_cfg['arch'], image_shape)
...
@@ -97,19 +97,11 @@ class Trainer(object):
     def _init_metrics(self):
         if self.mode == 'eval':
             if self.cfg.metric == 'COCO':
-                mask_resolution = self.model.mask_post_process.mask_resolution if getattr(
-                    self.model, 'mask_post_process', None) else None
-                self._metrics = [
-                    COCOMetric(
-                        anno_file=self.dataset.get_anno(),
-                        with_background=self.cfg.with_background,
-                        mask_resolution=mask_resolution)
-                ]
+                self._metrics = [COCOMetric(anno_file=self.dataset.get_anno())]
             elif self.cfg.metric == 'VOC':
                 self._metrics = [
                     VOCMetric(
                         anno_file=self.dataset.get_anno(),
-                        with_background=self.cfg.with_background,
                         class_num=self.cfg.num_classes,
                         map_type=self.cfg.map_type)
                 ]
@@ -240,9 +232,7 @@ class Trainer(object):
         imid2path = self.dataset.get_imid2path()

         anno_file = self.dataset.get_anno()
-        with_background = self.cfg.with_background
-        clsid2catid, catid2name = get_categories(self.cfg.metric, anno_file,
-                                                 with_background)
+        clsid2catid, catid2name = get_categories(self.cfg.metric, anno_file)

         # Run Infer
         for step_id, data in enumerate(loader):
@@ -255,14 +245,6 @@ class Trainer(object):
             for key, value in outs.items():
                 outs[key] = value.numpy()

-            # FIXME: for more elegent coding
-            if 'mask' in outs and 'bbox' in outs:
-                mask_resolution = self.model.mask_post_process.mask_resolution
-                from ppdet.py_op.post_process import mask_post_process
-                outs['mask'] = mask_post_process(outs, outs['im_shape'],
-                                                 outs['scale_factor'],
-                                                 mask_resolution)
             batch_res = get_infer_results(outs, clsid2catid)
             bbox_num = outs['bbox_num']

             start = 0
...
@@ -25,15 +25,13 @@ logger = setup_logger(__name__)
 __all__ = ['get_categories']


-def get_categories(metric_type, anno_file=None, with_background=True):
+def get_categories(metric_type, anno_file=None):
     """
     Get class id to category id map and category id
     to category name map from annotation file.

     Args:
         anno_file (str): annotation file path
-        with_background (bool, default True):
-            whether load background as class 0.
     """
     if metric_type.lower() == 'coco':
         if anno_file and os.path.isfile(anno_file):
@@ -43,21 +41,14 @@ def get_categories(metric_type, anno_file=None, with_background=True):
             coco = COCO(anno_file)
             cats = coco.loadCats(coco.getCatIds())

-            clsid2catid = {
-                i + int(with_background): cat['id']
-                for i, cat in enumerate(cats)
-            }
+            clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
             catid2name = {cat['id']: cat['name'] for cat in cats}
-
-            if with_background:
-                clsid2catid.update({0: 0})
-                catid2name.update({0: 'background'})
-
             return clsid2catid, catid2name

         # anno file not exist, load default categories of COCO17
         else:
-            return _coco17_category(with_background)
+            return _coco17_category()

     elif metric_type.lower() == 'voc':
         if anno_file and os.path.isfile(anno_file):
@@ -66,9 +57,7 @@ def get_categories(metric_type, anno_file=None, with_background=True):
             for line in f.readlines():
                 cats.append(line.strip())

-            if cats[0] != 'background' and with_background:
-                cats.insert(0, 'background')
-            if cats[0] == 'background' and not with_background:
+            if cats[0] == 'background':
                 cats = cats[1:]

             clsid2catid = {i: i for i in range(len(cats))}
@@ -79,25 +68,22 @@ def get_categories(metric_type, anno_file=None, with_background=True):
         # anno file not exist, load default categories of
         # VOC all 20 categories
         else:
-            return _vocall_category(with_background)
+            return _vocall_category()

     elif metric_type.lower() == 'oid':
         if anno_file and os.path.isfile(anno_file):
             logger.warn("only default categories support for OID19")
-        return _oid19_category(with_background)
+        return _oid19_category()

     else:
         raise ValueError("unknown metric type {}".format(metric_type))
-def _coco17_category(with_background=True):
+def _coco17_category():
     """
     Get class id to category id map and category id
     to category name map of COCO2017 dataset

-    Args:
-        with_background (bool, default True):
-            whether load background as class 0.
     """
     clsid2catid = {
         1: 1,
@@ -266,39 +252,30 @@ def _coco17_category(with_background=True):
         90: 'toothbrush'
     }

-    if not with_background:
-        clsid2catid = {k - 1: v for k, v in clsid2catid.items()}
-        catid2name.pop(0)
-    else:
-        clsid2catid.update({0: 0})
+    clsid2catid = {k - 1: v for k, v in clsid2catid.items()}
+    catid2name.pop(0)

     return clsid2catid, catid2name
-def _vocall_category(with_background=True):
+def _vocall_category():
     """
     Get class id to category id map and category id
     to category name map of mixup voc dataset

-    Args:
-        with_background (bool, default True):
-            whether load background as class 0.
     """
-    label_map = pascalvoc_label(with_background)
+    label_map = pascalvoc_label()
     label_map = sorted(label_map.items(), key=lambda x: x[1])
     cats = [l[0] for l in label_map]

-    if with_background:
-        cats.insert(0, 'background')
-
     clsid2catid = {i: i for i in range(len(cats))}
     catid2name = {i: name for i, name in enumerate(cats)}

     return clsid2catid, catid2name
-def _oid19_category(with_background=True):
-    clsid2catid = {k: k for k in range(1, 501)}
+def _oid19_category():
+    clsid2catid = {k: k + 1 for k in range(500)}

     catid2name = {
         0: "background",
@@ -804,6 +781,4 @@ def _oid19_category():
         500: "Toilet",
     }

-    if not with_background:
-        clsid2catid = {k - 1: v for k, v in clsid2catid.items()}
-
     return clsid2catid, catid2name
@@ -38,17 +38,17 @@ def get_infer_results(outs, catid):
         )

     im_id = outs['im_id']
-    im_shape = outs['im_shape']
-    scale_factor = outs['scale_factor']

     infer_res = {}
     if 'bbox' in outs:
-        infer_res['bbox'] = get_det_res(outs['bbox'], outs['bbox_num'], im_id,
-                                        catid)
+        infer_res['bbox'] = get_det_res(outs['bbox'], outs['score'],
+                                        outs['label'], outs['bbox_num'], im_id,
+                                        catid)

     if 'mask' in outs:
         # mask post process
-        infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox_num'], im_id,
-                                        catid)
+        infer_res['mask'] = get_seg_res(outs['mask'], outs['score'],
+                                        outs['label'], outs['bbox_num'], im_id,
+                                        catid)

     if 'segm' in outs:
...
@@ -49,14 +49,11 @@ class Metric(paddle.metric.Metric):
 class COCOMetric(Metric):
-    def __init__(self, anno_file, with_background=True, mask_resolution=None):
+    def __init__(self, anno_file):
         assert os.path.isfile(anno_file), \
             "anno_file {} not a file".format(anno_file)
         self.anno_file = anno_file
-        self.with_background = with_background
-        self.mask_resolution = mask_resolution
-        self.clsid2catid, self.catid2name = get_categories('COCO', anno_file,
-                                                           with_background)
+        self.clsid2catid, self.catid2name = get_categories('COCO', anno_file)

         self.reset()
@@ -71,16 +68,9 @@ class COCOMetric(Metric):
         for k, v in outputs.items():
             outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v

-        # some input fields also needed
-        for k in ['im_id', 'scale_factor', 'im_shape']:
-            v = inputs[k]
-            outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v
-
-        if 'mask' in outs and 'bbox' in outs:
-            from ppdet.py_op.post_process import mask_post_process
-            outs['mask'] = mask_post_process(outs, outs['im_shape'],
-                                             outs['scale_factor'],
-                                             self.mask_resolution)
+        im_id = inputs['im_id']
+        outs['im_id'] = im_id.numpy() if isinstance(im_id,
+                                                    paddle.Tensor) else im_id

         infer_results = get_infer_results(outs, self.clsid2catid)
         self.results['bbox'] += infer_results[
@@ -131,7 +121,6 @@ class COCOMetric(Metric):
 class VOCMetric(Metric):
     def __init__(self,
                  anno_file,
-                 with_background=True,
                  class_num=20,
                  overlap_thresh=0.5,
                  map_type='11point',
@@ -140,9 +129,7 @@ class VOCMetric(Metric):
         assert os.path.isfile(anno_file), \
             "anno_file {} not a file".format(anno_file)
         self.anno_file = anno_file
-        self.with_background = with_background
-        self.clsid2catid, self.catid2name = get_categories('VOC', anno_file,
-                                                           with_background)
+        self.clsid2catid, self.catid2name = get_categories('VOC', anno_file)

         self.overlap_thresh = overlap_thresh
         self.map_type = map_type
...
 from . import ops
-from . import bbox
-from . import mask
 from . import backbones
 from . import necks
+from . import proposal_generator
 from . import heads
 from . import losses
 from . import architectures
@@ -11,10 +10,9 @@ from . import layers
 from . import utils

 from .ops import *
-from .bbox import *
-from .mask import *
 from .backbones import *
 from .necks import *
+from .proposal_generator import *
 from .heads import *
 from .losses import *
 from .architectures import *
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import paddle
-from ppdet.core.workspace import register
+from ppdet.core.workspace import register, create
 from .meta_arch import BaseArch

 __all__ = ['FasterRCNN']
@@ -12,91 +26,86 @@ __all__ = ['FasterRCNN']
 @register
 class FasterRCNN(BaseArch):
     __category__ = 'architecture'
-    __inject__ = [
-        'anchor', 'proposal', 'backbone', 'neck', 'rpn_head', 'bbox_head',
-        'bbox_post_process'
-    ]
+    __inject__ = ['bbox_post_process']

     def __init__(self,
-                 anchor,
-                 proposal,
                  backbone,
                  rpn_head,
                  bbox_head,
                  bbox_post_process,
                  neck=None):
+        """
+        backbone (nn.Layer): backbone instance.
+        rpn_head (nn.Layer): generates proposals using backbone features.
+        bbox_head (nn.Layer): a head that performs per-region computation.
+        mask_head (nn.Layer): generates mask from bbox and backbone features.
+        """
         super(FasterRCNN, self).__init__()
-        self.anchor = anchor
-        self.proposal = proposal
         self.backbone = backbone
-        self.neck = neck
         self.rpn_head = rpn_head
         self.bbox_head = bbox_head
         self.bbox_post_process = bbox_post_process
+        self.neck = neck

-    def model_arch(self):
-        # Backbone
-        body_feats = self.backbone(self.inputs)
-        spatial_scale = 0.0625
+    @classmethod
+    def from_config(cls, cfg, *args, **kwargs):
+        backbone = create(cfg['backbone'])
+        kwargs = {'input_shape': backbone.out_shape}
+        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
+
+        out_shape = neck and neck.out_shape or backbone.out_shape
+        kwargs = {'input_shape': out_shape}
+        rpn_head = create(cfg['rpn_head'], **kwargs)
+        bbox_head = create(cfg['bbox_head'], **kwargs)
+        return {
+            'backbone': backbone,
+            'neck': neck,
+            "rpn_head": rpn_head,
+            "bbox_head": bbox_head,
+        }

-        # Neck
+    def _forward(self):
+        body_feats = self.backbone(self.inputs)
         if self.neck is not None:
-            body_feats, spatial_scale = self.neck(body_feats)
-
-        # RPN
-        # rpn_head returns two list: rpn_feat, rpn_head_out
-        # each element in rpn_feats contains rpn feature on each level,
-        # and the length is 1 when the neck is not applied.
-        # each element in rpn_head_out contains (rpn_rois_score, rpn_rois_delta)
-        rpn_feat, self.rpn_head_out = self.rpn_head(self.inputs, body_feats)
-
-        # Anchor
-        # anchor_out returns a list,
-        # each element contains (anchor, anchor_var)
-        self.anchor_out = self.anchor(rpn_feat)
-
-        # Proposal RoI
-        # compute targets here when training
-        rois = self.proposal(self.inputs, self.rpn_head_out, self.anchor_out,
-                             self.training)
-        # BBox Head
-        bbox_feat, self.bbox_head_out, self.bbox_head_feat_func = self.bbox_head(
-            body_feats, rois, spatial_scale)
-
-        if not self.training:
-            bbox_pred, bboxes = self.bbox_head.get_prediction(
-                self.bbox_head_out, rois)
-            # Refine bbox by the output from bbox_head at test stage
-            self.bboxes = self.bbox_post_process(bbox_pred, bboxes,
-                                                 self.inputs['im_shape'],
-                                                 self.inputs['scale_factor'])
+            body_feats = self.neck(body_feats)
+        if self.training:
+            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
+            bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num,
+                                          self.inputs)
+            return rpn_loss, bbox_loss
         else:
-            # Proposal RoI for Mask branch
-            # bboxes update at training stage only
-            bbox_targets = self.proposal.get_targets()[0]
+            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
+            preds, _ = self.bbox_head(body_feats, rois, rois_num, None)
+
+            im_shape = self.inputs['im_shape']
+            scale_factor = self.inputs['scale_factor']
+            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
+                                                    im_shape, scale_factor)
+
+            # rescale the prediction back to origin image
+            bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
+                                                        im_shape, scale_factor)
+            return bbox_pred, bbox_num

     def get_loss(self, ):
+        rpn_loss, bbox_loss = self._forward()
         loss = {}
-
-        # RPN loss
-        rpn_loss_inputs = self.anchor.generate_loss_inputs(
-            self.inputs, self.rpn_head_out, self.anchor_out)
-        loss_rpn = self.rpn_head.get_loss(rpn_loss_inputs)
-        loss.update(loss_rpn)
-
-        # BBox loss
-        bbox_targets = self.proposal.get_targets()
-        loss_bbox = self.bbox_head.get_loss([self.bbox_head_out], bbox_targets)
-        loss.update(loss_bbox)
+        loss.update(rpn_loss)
+        loss.update(bbox_loss)
         total_loss = paddle.add_n(list(loss.values()))
         loss.update({'loss': total_loss})
         return loss

     def get_pred(self):
-        bbox, bbox_num = self.bboxes
+        bbox_pred, bbox_num = self._forward()
+        label = bbox_pred[:, 0]
+        score = bbox_pred[:, 1]
+        bbox = bbox_pred[:, 2:]
         output = {
             'bbox': bbox,
-            'bbox_num': bbox_num,
+            'score': score,
+            'label': label,
+            'bbox_num': bbox_num
         }
         return output
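The new `from_config` classmethod replaces the injected `anchor`/`proposal` modules with explicit construction: each component is built from config and its `out_shape` is threaded into the next component's `input_shape`, falling back to the backbone's shapes when no neck is configured. A minimal sketch of that shape-threading pattern with stand-in classes (all names here are illustrative placeholders, not PaddleDetection APIs):

```python
class Backbone:
    # per-level output channels (illustrative values)
    out_shape = [256, 512, 1024]

class Neck:
    def __init__(self, input_shape):
        # a real FPN would fuse levels; here we only record shapes,
        # emitting a uniform 256 channels per input level
        self.out_shape = [256 for _ in input_shape]

class Head:
    def __init__(self, input_shape):
        self.input_shape = input_shape

def build_from_config(use_neck=True):
    """Thread out_shape from backbone, through an optional neck, into a head."""
    backbone = Backbone()
    neck = Neck(input_shape=backbone.out_shape) if use_neck else None
    # same and/or pattern as the diff: the head consumes the neck's
    # shapes when a neck exists, otherwise the backbone's
    out_shape = neck and neck.out_shape or backbone.out_shape
    return Head(input_shape=out_shape)
```

This keeps component wiring declarative: each layer only needs to expose `out_shape`, and the architecture class decides the data flow once, in one place.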
...@@ -17,7 +17,7 @@ from __future__ import division ...@@ -17,7 +17,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle import paddle
from ppdet.core.workspace import register from ppdet.core.workspace import register, create
from .meta_arch import BaseArch from .meta_arch import BaseArch
__all__ = ['MaskRCNN'] __all__ = ['MaskRCNN']
...@@ -27,22 +27,11 @@ __all__ = ['MaskRCNN'] ...@@ -27,22 +27,11 @@ __all__ = ['MaskRCNN']
class MaskRCNN(BaseArch): class MaskRCNN(BaseArch):
__category__ = 'architecture' __category__ = 'architecture'
__inject__ = [ __inject__ = [
'anchor',
'proposal',
'mask',
'backbone',
'neck',
'rpn_head',
'bbox_head',
'mask_head',
'bbox_post_process', 'bbox_post_process',
'mask_post_process', 'mask_post_process',
] ]
def __init__(self, def __init__(self,
anchor,
proposal,
mask,
backbone, backbone,
rpn_head, rpn_head,
bbox_head, bbox_head,
...@@ -50,95 +39,99 @@ class MaskRCNN(BaseArch): ...@@ -50,95 +39,99 @@ class MaskRCNN(BaseArch):
bbox_post_process, bbox_post_process,
mask_post_process, mask_post_process,
neck=None): neck=None):
"""
backbone (nn.Layer): backbone instance.
rpn_head (nn.Layer): generates proposals using backbone features.
bbox_head (nn.Layer): a head that performs per-region computation.
mask_head (nn.Layer): generates mask from bbox and backbone features.
"""
super(MaskRCNN, self).__init__() super(MaskRCNN, self).__init__()
self.anchor = anchor
self.proposal = proposal
self.mask = mask
self.backbone = backbone self.backbone = backbone
self.neck = neck self.neck = neck
self.rpn_head = rpn_head self.rpn_head = rpn_head
self.bbox_head = bbox_head self.bbox_head = bbox_head
self.mask_head = mask_head self.mask_head = mask_head
self.bbox_post_process = bbox_post_process self.bbox_post_process = bbox_post_process
self.mask_post_process = mask_post_process self.mask_post_process = mask_post_process
def model_arch(self): @classmethod
# Backbone def from_config(cls, cfg, *args, **kwargs):
body_feats = self.backbone(self.inputs) backbone = create(cfg['backbone'])
spatial_scale = 1. / 16 kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
bbox_head = create(cfg['bbox_head'], **kwargs)
out_shape = neck and out_shape or bbox_head.get_head().out_shape
kwargs = {'input_shape': out_shape}
mask_head = create(cfg['mask_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"rpn_head": rpn_head,
"bbox_head": bbox_head,
"mask_head": mask_head,
}
    def _forward(self):
        body_feats = self.backbone(self.inputs)
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        if self.training:
            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
                                                  self.inputs)
            rois, rois_num = self.bbox_head.get_assigned_rois()
            bbox_targets = self.bbox_head.get_assigned_targets()
            # Mask Head needs bbox_feat in Mask RCNN
            mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs,
                                       bbox_targets, bbox_feat)
            return rpn_loss, bbox_loss, mask_loss
        else:
            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
            preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None)

            im_shape = self.inputs['im_shape']
            scale_factor = self.inputs['scale_factor']

            bbox, bbox_num = self.bbox_post_process(preds, (rois, rois_num),
                                                    im_shape, scale_factor)
            mask_out = self.mask_head(
                body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)

            # rescale the prediction back to origin image
            bbox_pred = self.bbox_post_process.get_pred(bbox, bbox_num,
                                                        im_shape, scale_factor)
            origin_shape = self.bbox_post_process.get_origin_shape()
            mask_pred = self.mask_post_process(mask_out[:, 0, :, :], bbox_pred,
                                               bbox_num, origin_shape)
            return bbox_pred, bbox_num, mask_pred

    def get_loss(self, ):
        rpn_loss, bbox_loss, mask_loss = self._forward()
        loss = {}
        loss.update(rpn_loss)
        loss.update(bbox_loss)
        loss.update(mask_loss)
        total_loss = paddle.add_n(list(loss.values()))
        loss.update({'loss': total_loss})
        return loss

    def get_pred(self):
        bbox_pred, bbox_num, mask_pred = self._forward()
        label = bbox_pred[:, 0]
        score = bbox_pred[:, 1]
        bbox = bbox_pred[:, 2:]
        output = {
            'label': label,
            'score': score,
            'bbox': bbox,
            'bbox_num': bbox_num,
            'mask': mask_pred,
        }
        return output
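After the refactor, `get_pred` unpacks all detections from one `bbox_pred` tensor whose rows are laid out as `[label, score, x1, y1, x2, y2]`. A minimal NumPy sketch of that unpacking (the array values here are made up for illustration):

```python
import numpy as np

# Each detection row follows the refactored layout: [label, score, x1, y1, x2, y2].
bbox_pred = np.array([
    [0., 0.98, 10., 20., 110., 220.],  # one high-confidence detection
    [2., 0.75, 30., 40., 90., 160.],   # a second detection, another class
])

label = bbox_pred[:, 0]  # class ids
score = bbox_pred[:, 1]  # confidences
bbox = bbox_pred[:, 2:]  # xyxy boxes
```

`bbox_num` in the real code then records how many of these rows belong to each image in the batch.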
...@@ -31,8 +31,8 @@ class BaseArch(nn.Layer):
                inputs[k] = data[i]
        return inputs

    def model_arch(self, ):
        pass

    def get_loss(self, ):
        raise NotImplementedError("Should implement get_loss method!")
...
...@@ -13,15 +13,16 @@
# limitations under the License.

import math
from numbers import Integral

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from paddle.regularizer import L2Decay
from ppdet.modeling.layers import DeformableConvV2
from .name_adapter import NameAdapter
from ..shape_spec import ShapeSpec

__all__ = ['ResNet', 'Res5Head']
...@@ -62,7 +63,7 @@ class ConvNormLayer(nn.Layer):
                stride=stride,
                padding=(filter_size - 1) // 2,
                groups=groups,
                weight_attr=paddle.ParamAttr(
                    learning_rate=lr, name=name + "_weights"),
                bias_attr=False)
        else:
...@@ -73,19 +74,19 @@ class ConvNormLayer(nn.Layer):
                stride=stride,
                padding=(filter_size - 1) // 2,
                groups=groups,
                weight_attr=paddle.ParamAttr(
                    learning_rate=lr, name=name + '_weights'),
                bias_attr=False,
                name=name)

        bn_name = name_adapter.fix_conv_norm_name(name)

        norm_lr = 0. if freeze_norm else lr
        param_attr = paddle.ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            name=bn_name + "_scale",
            trainable=False if freeze_norm else True)
        bias_attr = paddle.ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            name=bn_name + "_offset",
...@@ -483,10 +484,12 @@ class ResNet(nn.Layer):
                    lr=1.0,
                    name=_name))

        ch_in_list = [64, 256, 512, 1024]
        ch_out_list = [64, 128, 256, 512]
        self.expansion = 4 if depth >= 50 else 1
        self._out_channels = [self.expansion * v for v in ch_out_list]
        self._out_strides = [4, 8, 16, 32]

        self.res_layers = []
        for i in range(num_stages):
...@@ -514,10 +517,18 @@ class ResNet(nn.Layer):
                        dcn_v2=(i in self.dcn_v2_stages)))
            self.res_layers.append(res_layer)

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

    def forward(self, inputs):
        x = inputs['image']
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
...@@ -530,16 +541,24 @@ class ResNet(nn.Layer):

@register
class Res5Head(nn.Layer):
    def __init__(self, depth=50):
        super(Res5Head, self).__init__()
        feat_in, feat_out = [1024, 512]
        if depth < 50:
            feat_in = 256
        na = NameAdapter(self)
        self.res5 = self.add_sublayer(
            'res5_roi_feat',
            Blocks(
                depth, feat_in, feat_out, count=3, name_adapter=na,
                stage_num=5))
        self.feat_out = feat_out if depth < 50 else feat_out * 4

    @property
    def out_shape(self):
        return [ShapeSpec(
            channels=self.feat_out,
            stride=32, )]

    def forward(self, roi_feat, stage=0):
        y = self.res5(roi_feat)
...
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from . import ops
@register
class Anchor(object):
__inject__ = ['anchor_generator', 'anchor_target_generator']
def __init__(self, anchor_generator, anchor_target_generator):
super(Anchor, self).__init__()
self.anchor_generator = anchor_generator
self.anchor_target_generator = anchor_target_generator
def __call__(self, rpn_feats):
anchors = []
num_level = len(rpn_feats)
for i, rpn_feat in enumerate(rpn_feats):
anchor, var = self.anchor_generator(rpn_feat, i)
anchors.append((anchor, var))
return anchors
def _get_target_input(self, rpn_feats, anchors):
rpn_score_list = []
rpn_delta_list = []
anchor_list = []
for (rpn_score, rpn_delta), (anchor, var) in zip(rpn_feats, anchors):
rpn_score = paddle.transpose(rpn_score, perm=[0, 2, 3, 1])
rpn_delta = paddle.transpose(rpn_delta, perm=[0, 2, 3, 1])
rpn_score = paddle.reshape(x=rpn_score, shape=(0, -1, 1))
rpn_delta = paddle.reshape(x=rpn_delta, shape=(0, -1, 4))
anchor = paddle.reshape(anchor, shape=(-1, 4))
var = paddle.reshape(var, shape=(-1, 4))
rpn_score_list.append(rpn_score)
rpn_delta_list.append(rpn_delta)
anchor_list.append(anchor)
rpn_scores = paddle.concat(rpn_score_list, axis=1)
rpn_deltas = paddle.concat(rpn_delta_list, axis=1)
anchors = paddle.concat(anchor_list)
return rpn_scores, rpn_deltas, anchors
def generate_loss_inputs(self, inputs, rpn_head_out, anchors):
if len(rpn_head_out) != len(anchors):
            raise ValueError(
                "rpn_head_out and anchors should have the same length, "
                "but received rpn_head_out's length is {} and anchors' "
                "length is {}".format(len(rpn_head_out), len(anchors)))
rpn_score, rpn_delta, anchors = self._get_target_input(rpn_head_out,
anchors)
score_pred, roi_pred, score_tgt, roi_tgt, roi_weight = self.anchor_target_generator(
bbox_pred=rpn_delta,
cls_logits=rpn_score,
anchor_box=anchors,
gt_boxes=inputs['gt_bbox'],
is_crowd=inputs['is_crowd'],
im_info=inputs['im_info'])
outs = {
'rpn_score_pred': score_pred,
'rpn_score_target': score_tgt,
'rpn_rois_pred': roi_pred,
'rpn_rois_target': roi_tgt,
'rpn_rois_weight': roi_weight
}
return outs
@register
class Proposal(object):
__inject__ = ['proposal_generator', 'proposal_target_generator']
def __init__(self, proposal_generator, proposal_target_generator):
super(Proposal, self).__init__()
self.proposal_generator = proposal_generator
self.proposal_target_generator = proposal_target_generator
def generate_proposal(self, inputs, rpn_head_out, anchor_out, is_train):
# TODO: delete im_info
try:
im_shape = inputs['im_info']
        except KeyError:
im_shape = inputs['im_shape']
rpn_rois_list = []
rpn_prob_list = []
rpn_rois_num_list = []
for (rpn_score, rpn_delta), (anchor, var) in zip(rpn_head_out,
anchor_out):
rpn_prob = F.sigmoid(rpn_score)
rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = self.proposal_generator(
scores=rpn_prob,
bbox_deltas=rpn_delta,
anchors=anchor,
variances=var,
im_shape=im_shape,
is_train=is_train)
if len(rpn_head_out) == 1:
return rpn_rois, rpn_rois_num
rpn_rois_list.append(rpn_rois)
rpn_prob_list.append(rpn_rois_prob)
rpn_rois_num_list.append(rpn_rois_num)
start_level = 2
end_level = start_level + len(rpn_head_out)
rois_collect, rois_num_collect = ops.collect_fpn_proposals(
rpn_rois_list,
rpn_prob_list,
start_level,
end_level,
post_nms_top_n,
rois_num_per_level=rpn_rois_num_list)
return rois_collect, rois_num_collect
def generate_proposal_target(self,
inputs,
rois,
rois_num,
stage=0,
max_overlap=None):
outs = self.proposal_target_generator(
rpn_rois=rois,
rpn_rois_num=rois_num,
gt_classes=inputs['gt_class'],
is_crowd=inputs['is_crowd'],
gt_boxes=inputs['gt_bbox'],
im_info=inputs['im_info'],
stage=stage,
max_overlap=max_overlap)
rois = outs[0]
max_overlap = outs[-1]
rois_num = outs[-2]
targets = {
'labels_int32': outs[1],
'bbox_targets': outs[2],
'bbox_inside_weights': outs[3],
'bbox_outside_weights': outs[4]
}
return rois, rois_num, targets, max_overlap
def refine_bbox(self, roi, bbox_delta, stage=1):
out_dim = bbox_delta.shape[1] // 4
bbox_delta_r = paddle.reshape(bbox_delta, (-1, out_dim, 4))
bbox_delta_s = paddle.slice(
bbox_delta_r, axes=[1], starts=[1], ends=[2])
reg_weights = [
i / stage for i in self.proposal_target_generator.bbox_reg_weights
]
refined_bbox = ops.box_coder(
prior_box=roi,
prior_box_var=reg_weights,
target_box=bbox_delta_s,
code_type='decode_center_size',
box_normalized=False,
axis=1)
refined_bbox = paddle.reshape(refined_bbox, shape=[-1, 4])
return refined_bbox
def __call__(self,
inputs,
rpn_head_out,
anchor_out,
is_train=False,
stage=0,
proposal_out=None,
bbox_head_out=None,
max_overlap=None):
if stage == 0:
roi, rois_num = self.generate_proposal(inputs, rpn_head_out,
anchor_out, is_train)
self.targets_list = []
self.max_overlap = None
else:
bbox_delta = bbox_head_out[1]
roi = self.refine_bbox(proposal_out[0], bbox_delta, stage)
rois_num = proposal_out[1]
if is_train:
roi, rois_num, targets, self.max_overlap = self.generate_proposal_target(
inputs, roi, rois_num, stage, self.max_overlap)
self.targets_list.append(targets)
return roi, rois_num
def get_targets(self):
return self.targets_list
def get_max_overlap(self):
return self.max_overlap
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
def bbox2delta(src_boxes, tgt_boxes, weights):
src_w = src_boxes[:, 2] - src_boxes[:, 0]
src_h = src_boxes[:, 3] - src_boxes[:, 1]
src_ctr_x = src_boxes[:, 0] + 0.5 * src_w
src_ctr_y = src_boxes[:, 1] + 0.5 * src_h
tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w
tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h
wx, wy, ww, wh = weights
dx = wx * (tgt_ctr_x - src_ctr_x) / src_w
dy = wy * (tgt_ctr_y - src_ctr_y) / src_h
dw = ww * paddle.log(tgt_w / src_w)
dh = wh * paddle.log(tgt_h / src_h)
deltas = paddle.stack((dx, dy, dw, dh), axis=1)
return deltas
def delta2bbox(deltas, boxes, weights):
clip_scale = math.log(1000.0 / 16)
if boxes.shape[0] == 0:
return paddle.zeros((0, deltas.shape[1]), dtype='float32')
widths = boxes[:, 2] - boxes[:, 0]
heights = boxes[:, 3] - boxes[:, 1]
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] / wx
dy = deltas[:, 1::4] / wy
dw = deltas[:, 2::4] / ww
dh = deltas[:, 3::4] / wh
    # Prevent sending too large values into paddle.exp()
dw = paddle.clip(dw, max=clip_scale)
dh = paddle.clip(dh, max=clip_scale)
pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1)
pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1)
pred_w = paddle.exp(dw) * widths.unsqueeze(1)
pred_h = paddle.exp(dh) * heights.unsqueeze(1)
pred_boxes = paddle.zeros_like(deltas)
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
return pred_boxes
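The encode/decode pair above is easiest to sanity-check on a single box. A paddle-free sketch of the same transform (the `encode`/`decode` helper names are illustrative, not part of the codebase) shows that decoding an encoded box recovers the target:

```python
import math

def encode(src, tgt, w=(10., 10., 5., 5.)):
    # Same math as bbox2delta, for one xyxy box pair.
    sx1, sy1, sx2, sy2 = src
    tx1, ty1, tx2, ty2 = tgt
    sw, sh = sx2 - sx1, sy2 - sy1
    scx, scy = sx1 + 0.5 * sw, sy1 + 0.5 * sh
    tw, th = tx2 - tx1, ty2 - ty1
    tcx, tcy = tx1 + 0.5 * tw, ty1 + 0.5 * th
    wx, wy, ww, wh = w
    return (wx * (tcx - scx) / sw, wy * (tcy - scy) / sh,
            ww * math.log(tw / sw), wh * math.log(th / sh))

def decode(src, delta, w=(10., 10., 5., 5.)):
    # Same math as delta2bbox, for one box and one delta.
    sx1, sy1, sx2, sy2 = src
    sw, sh = sx2 - sx1, sy2 - sy1
    scx, scy = sx1 + 0.5 * sw, sy1 + 0.5 * sh
    wx, wy, ww, wh = w
    dx, dy, dw, dh = delta[0] / wx, delta[1] / wy, delta[2] / ww, delta[3] / wh
    pcx, pcy = dx * sw + scx, dy * sh + scy
    pw, ph = math.exp(dw) * sw, math.exp(dh) * sh
    return (pcx - 0.5 * pw, pcy - 0.5 * ph, pcx + 0.5 * pw, pcy + 0.5 * ph)

src, tgt = (0., 0., 10., 10.), (2., 3., 12., 17.)
rec = decode(src, encode(src, tgt))
```

The round trip holds because the weights cancel: `decode(src, encode(src, tgt))` returns `tgt` up to floating-point error.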
def expand_bbox(bboxes, scale):
w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5
w_half *= scale
h_half *= scale
bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)
bboxes_exp[:, 0] = x_c - w_half
bboxes_exp[:, 2] = x_c + w_half
bboxes_exp[:, 1] = y_c - h_half
bboxes_exp[:, 3] = y_c + h_half
return bboxes_exp
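As a quick illustration of the scaling `expand_bbox` performs, here is a single-box pure-Python version (the `expand` helper is illustrative only):

```python
def expand(box, scale):
    # Scale an xyxy box about its center by `scale`, as expand_bbox does per row.
    x1, y1, x2, y2 = box
    w_half = (x2 - x1) * 0.5 * scale
    h_half = (y2 - y1) * 0.5 * scale
    xc, yc = (x1 + x2) * 0.5, (y1 + y2) * 0.5
    return (xc - w_half, yc - h_half, xc + w_half, yc + h_half)
```

Doubling a 10x10 box at the origin yields a 20x20 box centered at the same point.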
def clip_bbox(boxes, im_shape):
h, w = im_shape
x1 = boxes[:, 0].clip(0, w)
y1 = boxes[:, 1].clip(0, h)
x2 = boxes[:, 2].clip(0, w)
y2 = boxes[:, 3].clip(0, h)
return paddle.stack([x1, y1, x2, y2], axis=1)
def nonempty_bbox(boxes, min_size=0, return_mask=False):
w = boxes[:, 2] - boxes[:, 0]
h = boxes[:, 3] - boxes[:, 1]
    mask = paddle.logical_and(w > min_size, h > min_size)
if return_mask:
return mask
keep = paddle.nonzero(mask).flatten()
return keep
def bbox_area(boxes):
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def bbox_overlaps(boxes1, boxes2):
area1 = bbox_area(boxes1)
area2 = bbox_area(boxes2)
xy_max = paddle.minimum(
paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
xy_min = paddle.maximum(
paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
width_height = xy_max - xy_min
width_height = width_height.clip(min=0)
inter = width_height.prod(axis=2)
overlaps = paddle.where(inter > 0, inter /
(paddle.unsqueeze(area1, 1) + area2 - inter),
paddle.zeros_like(inter))
return overlaps
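For a single pair of boxes, `bbox_overlaps` reduces to the usual IoU formula; a small pure-Python sketch (the `iou` helper is illustrative, not part of the module):

```python
def iou(a, b):
    # Intersection-over-union of two xyxy boxes, mirroring one cell of bbox_overlaps.
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    iw, ih = max(ix2 - ix1, 0.), max(iy2 - iy1, 0.)
    inter = iw * ih
    area = lambda z: (z[2] - z[0]) * (z[3] - z[1])
    return inter / (area(a) + area(b) - inter) if inter > 0 else 0.
```

Two unit-offset 2x2 boxes overlap in a 1x1 square, giving IoU 1/7.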
...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from . import bbox_head
from . import mask_head
from . import yolo_head
...@@ -22,7 +21,6 @@ from . import fcos_head
from . import solov2_head
from . import ttf_head

from .bbox_head import *
from .mask_head import *
from .yolo_head import *
...
...@@ -13,234 +13,216 @@
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal, XavierUniform
from paddle.regularizer import L2Decay

from ppdet.core.workspace import register, create
from ppdet.modeling import ops

from .roi_extractor import RoIAlign
from ..shape_spec import ShapeSpec
from ..bbox_utils import bbox2delta


@register
class TwoFCHead(nn.Layer):
    def __init__(self, in_dim=256, mlp_dim=1024, resolution=7):
        super(TwoFCHead, self).__init__()
        self.in_dim = in_dim
        self.mlp_dim = mlp_dim
        fan = in_dim * resolution * resolution
        lr_factor = 1.
        self.fc6 = nn.Linear(
            in_dim * resolution * resolution,
            mlp_dim,
            weight_attr=paddle.ParamAttr(
                learning_rate=lr_factor,
                initializer=XavierUniform(fan_out=fan)))

        self.fc7 = nn.Linear(
            mlp_dim,
            mlp_dim,
            weight_attr=paddle.ParamAttr(
                learning_rate=lr_factor, initializer=XavierUniform()))

    @classmethod
    def from_config(cls, cfg, input_shape):
        s = input_shape
        s = s[0] if isinstance(s, (list, tuple)) else s
        return {'in_dim': s.channels}

    @property
    def out_shape(self):
        return [ShapeSpec(channels=self.mlp_dim, )]

    def forward(self, rois_feat):
        rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)
        fc6 = self.fc6(rois_feat)
        fc6 = F.relu(fc6)
        fc7 = self.fc7(fc6)
        fc7 = F.relu(fc7)
        return fc7


@register
class BBoxHead(nn.Layer):
    __shared__ = ['num_classes']
    __inject__ = ['bbox_assigner']
    """
    head (nn.Layer): Extract feature in bbox head
    in_channel (int): Input channel after RoI extractor
    """

    def __init__(self,
                 head,
                 in_channel,
                 roi_extractor=RoIAlign().__dict__,
                 bbox_assigner='BboxAssigner',
                 with_pool=False,
                 num_classes=80,
                 bbox_weight=[10., 10., 5., 5.]):
        super(BBoxHead, self).__init__()
        self.head = head
        self.roi_extractor = roi_extractor
        if isinstance(roi_extractor, dict):
            self.roi_extractor = RoIAlign(**roi_extractor)
        self.bbox_assigner = bbox_assigner

        self.with_pool = with_pool
        self.num_classes = num_classes
        self.bbox_weight = bbox_weight

        lr_factor = 1.
        self.bbox_score = nn.Linear(
            in_channel,
            self.num_classes + 1,
            weight_attr=paddle.ParamAttr(
                learning_rate=lr_factor, initializer=Normal(
                    mean=0.0, std=0.01)))

        self.bbox_delta = nn.Linear(
            in_channel,
            4 * self.num_classes,
            weight_attr=paddle.ParamAttr(
                learning_rate=lr_factor,
                initializer=Normal(
                    mean=0.0, std=0.001)))
        self.assigned_rois = None
        self.assigned_targets = None

    @classmethod
    def from_config(cls, cfg, input_shape):
        roi_pooler = cfg['roi_extractor']
        assert isinstance(roi_pooler, dict)
        kwargs = RoIAlign.from_config(cfg, input_shape)
        roi_pooler.update(kwargs)
        kwargs = {'input_shape': input_shape}
        head = create(cfg['head'], **kwargs)
        return {
            'roi_extractor': roi_pooler,
            'head': head,
            'in_channel': head.out_shape[0].channels
        }

    def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None):
        """
        body_feats (list[Tensor]): feature maps from the backbone/neck
        rois (Tensor): RoIs generated by the RPN module
        rois_num (Tensor): number of RoIs per image in the batch
        inputs (dict{Tensor}): ground-truth inputs of the batch
        """
        if self.training:
            rois, rois_num, _, targets = self.bbox_assigner(rois, rois_num,
                                                            inputs)
            self.assigned_rois = (rois, rois_num)
            self.assigned_targets = targets

        rois_feat = self.roi_extractor(body_feats, rois, rois_num)
        bbox_feat = self.head(rois_feat)
        # pool spatial head features (e.g. from Res5Head) down to a vector
        if len(bbox_feat.shape) > 2 and bbox_feat.shape[-1] > 1:
            feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1)
            feat = paddle.squeeze(feat, axis=[2, 3])
        else:
            feat = bbox_feat
        scores = self.bbox_score(feat)
        deltas = self.bbox_delta(feat)

        if self.training:
            loss = self.get_loss(scores, deltas, targets, rois)
            return loss, bbox_feat
        else:
            pred = self.get_prediction(scores, deltas)
            return pred, self.head

    def get_loss(self, scores, deltas, targets, rois):
        """
        scores (Tensor): scores from bbox head outputs
        deltas (Tensor): deltas from bbox head outputs
        targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds
        rois (List[Tensor]): RoIs generated in each batch
        """
        # TODO: better pass args
        tgt_labels, tgt_bboxes, tgt_gt_inds = targets
        tgt_labels = paddle.concat(tgt_labels) if len(
            tgt_labels) > 1 else tgt_labels[0]
        tgt_labels = tgt_labels.cast('int64')
        tgt_labels.stop_gradient = True
        loss_bbox_cls = F.cross_entropy(
            input=scores, label=tgt_labels, reduction='mean')

        # bbox reg
        cls_agnostic_bbox_reg = deltas.shape[1] == 4

        fg_inds = paddle.nonzero(
            paddle.logical_and(tgt_labels >= 0, tgt_labels <
                               self.num_classes)).flatten()

        if cls_agnostic_bbox_reg:
            reg_delta = paddle.gather(deltas, fg_inds)
        else:
            fg_gt_classes = paddle.gather(tgt_labels, fg_inds)

            reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1)
            reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1])

            reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4)
            reg_col_inds = reg_col_inds.reshape([-1, 1])
            reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1)

            reg_delta = paddle.gather(deltas, fg_inds)
            reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4])

        rois = paddle.concat(rois) if len(rois) > 1 else rois[0]
        tgt_bboxes = paddle.concat(tgt_bboxes) if len(
            tgt_bboxes) > 1 else tgt_bboxes[0]

        reg_target = bbox2delta(rois, tgt_bboxes, self.bbox_weight)
        reg_target = paddle.gather(reg_target, fg_inds)
        reg_target.stop_gradient = True

        loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum(
        ) / tgt_labels.shape[0]

        cls_name = 'loss_bbox_cls'
        reg_name = 'loss_bbox_reg'
        loss_bbox = {}
        loss_bbox[cls_name] = loss_bbox_cls
        loss_bbox[reg_name] = loss_bbox_reg

        return loss_bbox

    def get_prediction(self, score, delta):
        bbox_prob = F.softmax(score)
        return delta, bbox_prob

    def get_head(self, ):
        return self.head

    def get_assigned_targets(self, ):
        return self.assigned_targets

    def get_assigned_rois(self, ):
        return self.assigned_rois
...@@ -13,195 +13,196 @@ ...@@ -13,195 +13,196 @@
# limitations under the License. # limitations under the License.
import paddle import paddle
import paddle.nn as nn
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn import Layer, Sequential
from paddle.nn import Conv2D, Conv2DTranspose, ReLU
from paddle.nn.initializer import KaimingNormal from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ppdet.core.workspace import register, create
from ppdet.modeling import ops from ppdet.modeling import ops
from .roi_extractor import RoIAlign
@register
class MaskFeat(nn.Layer):
    def __init__(self, num_convs=0, in_channels=2048, out_channels=256):
        super(MaskFeat, self).__init__()
        self.num_convs = num_convs
        self.in_channels = in_channels
        self.out_channels = out_channels
        fan_conv = out_channels * 3 * 3
        fan_deconv = out_channels * 2 * 2

        mask_conv = nn.Sequential()
        for i in range(self.num_convs):
            conv_name = 'mask_inter_feat_{}'.format(i + 1)
            mask_conv.add_sublayer(
                conv_name,
                nn.Conv2D(
                    in_channels=in_channels if i == 0 else out_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    padding=1,
                    weight_attr=paddle.ParamAttr(
                        initializer=KaimingNormal(fan_in=fan_conv))))
            mask_conv.add_sublayer(conv_name + 'act', nn.ReLU())
        mask_conv.add_sublayer(
            'conv5_mask',
            nn.Conv2DTranspose(
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                kernel_size=2,
                stride=2,
                weight_attr=paddle.ParamAttr(
                    initializer=KaimingNormal(fan_in=fan_deconv))))
        mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU())
        self.upsample = mask_conv

    @classmethod
    def from_config(cls, cfg, input_shape):
        if isinstance(input_shape, (list, tuple)):
            input_shape = input_shape[0]
        return {'in_channels': input_shape.channels, }

    def out_channel(self):
        return self.out_channels

    def forward(self, feats):
        return self.upsample(feats)
@register
class MaskHead(nn.Layer):
    __shared__ = ['num_classes']
    __inject__ = ['mask_assigner']

    def __init__(self,
                 head,
                 roi_extractor=RoIAlign().__dict__,
                 mask_assigner='MaskAssigner',
                 num_classes=80,
                 share_bbox_feat=False):
        super(MaskHead, self).__init__()
        self.num_classes = num_classes

        self.roi_extractor = roi_extractor
        if isinstance(roi_extractor, dict):
            self.roi_extractor = RoIAlign(**roi_extractor)
        self.head = head
        self.in_channels = head.out_channel()
        self.mask_assigner = mask_assigner
        self.share_bbox_feat = share_bbox_feat
        self.bbox_head = None

        self.mask_fcn_logits = nn.Conv2D(
            in_channels=self.in_channels,
            out_channels=self.num_classes,
            kernel_size=1,
            weight_attr=paddle.ParamAttr(initializer=KaimingNormal(
                fan_in=self.num_classes)))

    @classmethod
    def from_config(cls, cfg, input_shape):
        roi_pooler = cfg['roi_extractor']
        assert isinstance(roi_pooler, dict)
        kwargs = RoIAlign.from_config(cfg, input_shape)
        roi_pooler.update(kwargs)
        kwargs = {'input_shape': input_shape}
        head = create(cfg['head'], **kwargs)
        return {
            'roi_extractor': roi_pooler,
            'head': head,
        }

    def get_loss(self, mask_logits, mask_label, mask_target, mask_weight):
        mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3])
        mask_label = paddle.expand_as(mask_label, mask_logits)
        mask_label.stop_gradient = True
        mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label))
        shape = mask_logits.shape
        mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]])

        mask_target = mask_target.cast('float32')
        mask_weight = mask_weight.unsqueeze([1, 2])
        loss_mask = F.binary_cross_entropy_with_logits(
            mask_pred, mask_target, weight=mask_weight, reduction="mean")
        return loss_mask

    def forward_train(self, body_feats, rois, rois_num, inputs, targets,
                      bbox_feat):
        """
        body_feats (list[Tensor]): Multi-level backbone features
        rois (list[Tensor]): Proposals for each batch with shape [N, 4]
        rois_num (Tensor): The number of proposals for each batch
        inputs (dict): ground truth info
        """
        tgt_labels, _, tgt_gt_inds = targets
        rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner(
            rois, tgt_labels, tgt_gt_inds, inputs)

        if self.share_bbox_feat:
            rois_feat = paddle.gather(bbox_feat, mask_index)
        else:
            rois_feat = self.roi_extractor(body_feats, rois, rois_num)

        mask_feat = self.head(rois_feat)
        mask_logits = self.mask_fcn_logits(mask_feat)

        loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks,
                                  tgt_weights)
        return {'loss_mask': loss_mask}
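The mask loss above supervises only the logit channel matching each RoI's assigned class, through a weighted sigmoid cross-entropy. A stdlib-only sketch of the same computation on nested lists (all names here are illustrative stand-ins, not PaddleDetection APIs):

```python
import math

def bce_with_logits(logit, target, weight=1.0):
    # Numerically stable sigmoid cross-entropy:
    # max(x, 0) - x * z + log(1 + exp(-|x|))
    return weight * (max(logit, 0.) - logit * target +
                     math.log1p(math.exp(-abs(logit))))

def mask_loss(mask_logits, labels, mask_targets, mask_weights):
    # mask_logits: [num_rois][num_classes][H][W] nested lists.
    # Only the channel of each RoI's assigned class is supervised.
    total, count = 0., 0
    for logits, cls, tgt, w in zip(mask_logits, labels, mask_targets,
                                   mask_weights):
        pred = logits[cls]  # class-specific mask channel
        for row_p, row_t in zip(pred, tgt):
            for p, t in zip(row_p, row_t):
                total += bce_with_logits(p, t, w)
                count += 1
    return total / max(count, 1)
```

A confident correct logit (e.g. 10.0 on a positive pixel) drives the loss toward zero, while an uncertain logit of 0.0 costs log 2 per pixel.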
    def forward_test(self,
                     body_feats,
                     rois,
                     rois_num,
                     scale_factor,
                     feat_func=None):
        """
        body_feats (list[Tensor]): Multi-level backbone features
        rois (Tensor): Prediction from bbox head with shape [N, 6]
        rois_num (Tensor): The number of prediction for each batch
        scale_factor (Tensor): The scale factor from origin size to input size
        """
        if rois.shape[0] == 0:
            mask_out = paddle.full([1, 1, 1, 1], -1)
        else:
            bbox = [rois[:, 2:]]
            labels = rois[:, 0].cast('int32')
            rois_feat = self.roi_extractor(body_feats, bbox, rois_num)
            if self.share_bbox_feat:
                assert feat_func is not None
                rois_feat = feat_func(rois_feat)

            mask_feat = self.head(rois_feat)
            mask_logit = self.mask_fcn_logits(mask_feat)
            mask_num_class = mask_logit.shape[1]
            if mask_num_class == 1:
                mask_out = F.sigmoid(mask_logit)
            else:
                num_masks = mask_logit.shape[0]
                pred_masks = paddle.split(mask_logit, num_masks)
                mask_out = []
                # TODO: need to optimize gather
                for i, pred_mask in enumerate(pred_masks):
                    mask = paddle.gather(pred_mask, labels[i], axis=1)
                    mask_out.append(mask)
                mask_out = F.sigmoid(paddle.concat(mask_out))
        return mask_out

    def forward(self,
                body_feats,
                rois,
                rois_num,
                inputs,
                targets=None,
                bbox_feat=None,
                feat_func=None):
        if self.training:
            return self.forward_train(body_feats, rois, rois_num, inputs,
                                      targets, bbox_feat)
        else:
            im_scale = inputs['scale_factor']
            return self.forward_test(body_feats, rois, rois_num, im_scale,
                                     feat_func)
def get_loss(self, mask_head_out, mask_target):
mask_logits = paddle.flatten(mask_head_out, start_axis=1, stop_axis=-1)
mask_label = paddle.cast(x=mask_target, dtype='float32')
mask_label.stop_gradient = True
loss_mask = ops.sigmoid_cross_entropy_with_logits(
input=mask_logits,
label=mask_label,
ignore_index=-1,
normalize=True)
loss_mask = paddle.sum(loss_mask)
return {'loss_mask': loss_mask}
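At inference the head computes mask logits for every class and then gathers, per RoI, the channel of that RoI's predicted label before applying the sigmoid. A minimal pure-Python sketch of that per-RoI selection (illustrative names, nested lists standing in for tensors):

```python
import math

def select_class_masks(mask_logits, labels):
    # mask_logits: [num_rois][num_classes][...] nested lists;
    # labels: predicted class id per RoI.
    # Returns one mask channel per RoI, mirroring the per-RoI
    # gather loop in forward_test.
    return [logits[label] for logits, label in zip(mask_logits, labels)]

def sigmoid(x):
    # Applied to the selected logits to obtain mask probabilities.
    return 1. / (1. + math.exp(-x))
```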
...@@ -17,32 +17,47 @@ from ppdet.core.workspace import register
from ppdet.modeling import ops


def _to_list(v):
    if not isinstance(v, (list, tuple)):
        return [v]
    return v


@register
class RoIAlign(object):
    def __init__(self,
                 resolution=14,
                 spatial_scale=0.0625,
                 sampling_ratio=0,
                 canconical_level=4,
                 canonical_size=224,
                 start_level=0,
                 end_level=3,
                 aligned=False):
        super(RoIAlign, self).__init__()
        self.resolution = resolution
        self.spatial_scale = _to_list(spatial_scale)
        self.sampling_ratio = sampling_ratio
        self.canconical_level = canconical_level
        self.canonical_size = canonical_size
        self.start_level = start_level
        self.end_level = end_level
        self.aligned = aligned

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {'spatial_scale': [1. / i.stride for i in input_shape]}

    def __call__(self, feats, roi, rois_num):
        roi = paddle.concat(roi) if len(roi) > 1 else roi[0]
        if len(feats) == 1:
            rois_feat = ops.roi_align(
                feats[self.start_level],
                roi,
                self.resolution,
                self.spatial_scale[0],
                rois_num=rois_num,
                aligned=self.aligned)
        else:
            offset = 2
            k_min = self.start_level + offset
...@@ -60,9 +75,11 @@ class RoIAlign(object):
                feats[lvl],
                rois_dist[lvl],
                self.resolution,
                self.spatial_scale[lvl],
                sampling_ratio=self.sampling_ratio,
                rois_num=rois_num_dist[lvl],
                aligned=self.aligned)
            if roi_feat.shape[0] > 0:
                rois_feat_list.append(roi_feat)
        rois_feat_shuffle = paddle.concat(rois_feat_list)
        rois_feat = paddle.gather(rois_feat_shuffle, restore_index)
......
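When `RoIAlign` runs over multiple FPN levels, `distribute_fpn_proposals` routes each RoI to a level based on its scale, following the FPN heuristic k = k0 + log2(sqrt(area) / 224). A stdlib sketch of that assignment (the level bounds and epsilon here are assumptions for illustration):

```python
import math

def assign_fpn_level(roi, k_min=2, k_max=5, canonical_level=4,
                     canonical_size=224.):
    # roi: (x1, y1, x2, y2). Maps the RoI's scale to an FPN level:
    # k = k0 + log2(sqrt(area) / canonical_size), clamped to [k_min, k_max].
    x1, y1, x2, y2 = roi
    scale = math.sqrt(max(x2 - x1, 0.) * max(y2 - y1, 0.))
    lvl = math.floor(canonical_level +
                     math.log2(scale / canonical_size + 1e-6))
    return int(min(max(lvl, k_min), k_max))
```

A 224x224 RoI lands on the canonical level, larger RoIs move to coarser levels, and small RoIs are clamped to the finest level.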
...@@ -27,9 +27,9 @@ from paddle.nn.initializer import Normal, Constant
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.bbox_utils import delta2bbox
from . import ops

from paddle.vision.ops import DeformConv2D
...@@ -223,53 +223,6 @@ class AnchorGeneratorRPN(object):
        return anchor, var
@register
@serializable
class AnchorTargetGeneratorRPN(object):
def __init__(self,
batch_size_per_im=256,
straddle_thresh=0.,
fg_fraction=0.5,
positive_overlap=0.7,
negative_overlap=0.3,
use_random=True):
super(AnchorTargetGeneratorRPN, self).__init__()
self.batch_size_per_im = batch_size_per_im
self.straddle_thresh = straddle_thresh
self.fg_fraction = fg_fraction
self.positive_overlap = positive_overlap
self.negative_overlap = negative_overlap
self.use_random = use_random
def __call__(self, cls_logits, bbox_pred, anchor_box, gt_boxes, is_crowd,
im_info):
anchor_box = anchor_box.numpy()
gt_boxes = gt_boxes.numpy()
is_crowd = is_crowd.numpy()
im_info = im_info.numpy()
loc_indexes, score_indexes, tgt_labels, tgt_bboxes, bbox_inside_weights = generate_rpn_anchor_target(
anchor_box, gt_boxes, is_crowd, im_info, self.straddle_thresh,
self.batch_size_per_im, self.positive_overlap,
self.negative_overlap, self.fg_fraction, self.use_random)
loc_indexes = to_tensor(loc_indexes)
score_indexes = to_tensor(score_indexes)
tgt_labels = to_tensor(tgt_labels)
tgt_bboxes = to_tensor(tgt_bboxes)
bbox_inside_weights = to_tensor(bbox_inside_weights)
loc_indexes.stop_gradient = True
score_indexes.stop_gradient = True
tgt_labels.stop_gradient = True
cls_logits = paddle.reshape(x=cls_logits, shape=(-1, ))
bbox_pred = paddle.reshape(x=bbox_pred, shape=(-1, 4))
pred_cls_logits = paddle.gather(cls_logits, score_indexes)
pred_bbox_pred = paddle.gather(bbox_pred, loc_indexes)
return pred_cls_logits, pred_bbox_pred, tgt_labels, tgt_bboxes, bbox_inside_weights
@register
@serializable
class AnchorGeneratorSSD(object):
...@@ -335,248 +288,52 @@ class AnchorGeneratorSSD(object):
        return boxes
@register
@serializable
class ProposalGenerator(object):
__append_doc__ = True
def __init__(self,
train_pre_nms_top_n=12000,
train_post_nms_top_n=2000,
infer_pre_nms_top_n=6000,
infer_post_nms_top_n=1000,
nms_thresh=.5,
min_size=.1,
eta=1.):
super(ProposalGenerator, self).__init__()
self.train_pre_nms_top_n = train_pre_nms_top_n
self.train_post_nms_top_n = train_post_nms_top_n
self.infer_pre_nms_top_n = infer_pre_nms_top_n
self.infer_post_nms_top_n = infer_post_nms_top_n
self.nms_thresh = nms_thresh
self.min_size = min_size
self.eta = eta
def __call__(self,
scores,
bbox_deltas,
anchors,
variances,
im_shape,
is_train=False):
pre_nms_top_n = self.train_pre_nms_top_n if is_train else self.infer_pre_nms_top_n
post_nms_top_n = self.train_post_nms_top_n if is_train else self.infer_post_nms_top_n
# TODO delete im_info
if im_shape.shape[1] > 2:
import paddle.fluid as fluid
rpn_rois, rpn_rois_prob, rpn_rois_num = fluid.layers.generate_proposals(
scores,
bbox_deltas,
im_shape,
anchors,
variances,
pre_nms_top_n=pre_nms_top_n,
post_nms_top_n=post_nms_top_n,
nms_thresh=self.nms_thresh,
min_size=self.min_size,
eta=self.eta,
return_rois_num=True)
else:
rpn_rois, rpn_rois_prob, rpn_rois_num = ops.generate_proposals(
scores,
bbox_deltas,
im_shape,
anchors,
variances,
pre_nms_top_n=pre_nms_top_n,
post_nms_top_n=post_nms_top_n,
nms_thresh=self.nms_thresh,
min_size=self.min_size,
eta=self.eta,
return_rois_num=True)
return rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n
@register
@serializable
class ProposalTargetGenerator(object):
__shared__ = ['num_classes']
def __init__(self,
batch_size_per_im=512,
fg_fraction=.25,
fg_thresh=[.5, ],
bg_thresh_hi=[.5, ],
bg_thresh_lo=[0., ],
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
num_classes=81,
use_random=True,
is_cls_agnostic=False):
super(ProposalTargetGenerator, self).__init__()
self.batch_size_per_im = batch_size_per_im
self.fg_fraction = fg_fraction
self.fg_thresh = fg_thresh
self.bg_thresh_hi = bg_thresh_hi
self.bg_thresh_lo = bg_thresh_lo
self.bbox_reg_weights = bbox_reg_weights
self.num_classes = num_classes
self.use_random = use_random
self.is_cls_agnostic = is_cls_agnostic
def __call__(self,
rpn_rois,
rpn_rois_num,
gt_classes,
is_crowd,
gt_boxes,
im_info,
stage=0,
max_overlap=None):
rpn_rois = rpn_rois.numpy()
rpn_rois_num = rpn_rois_num.numpy()
gt_classes = gt_classes.numpy()
gt_boxes = gt_boxes.numpy()
is_crowd = is_crowd.numpy()
im_info = im_info.numpy()
max_overlap = max_overlap if max_overlap is None else max_overlap.numpy(
)
reg_weights = [i / (stage + 1) for i in self.bbox_reg_weights]
is_cascade = True if stage > 0 else False
num_classes = 2 if is_cascade else self.num_classes
outs = generate_proposal_target(
rpn_rois, rpn_rois_num, gt_classes, is_crowd, gt_boxes, im_info,
self.batch_size_per_im, self.fg_fraction, self.fg_thresh[stage],
self.bg_thresh_hi[stage], self.bg_thresh_lo[stage], reg_weights,
num_classes, self.use_random, self.is_cls_agnostic, is_cascade,
max_overlap)
outs = [to_tensor(v) for v in outs]
for v in outs:
v.stop_gradient = True
return outs
@register
@serializable
class MaskTargetGenerator(object):
__shared__ = ['num_classes', 'mask_resolution']
def __init__(self, num_classes=81, mask_resolution=14):
super(MaskTargetGenerator, self).__init__()
self.num_classes = num_classes
self.mask_resolution = mask_resolution
def __call__(self, im_info, gt_classes, is_crowd, gt_segms, rois, rois_num,
labels_int32):
im_info = im_info.numpy()
gt_classes = gt_classes.numpy()
is_crowd = is_crowd.numpy()
gt_segms = gt_segms.numpy()
rois = rois.numpy()
rois_num = rois_num.numpy()
labels_int32 = labels_int32.numpy()
outs = generate_mask_target(im_info, gt_classes, is_crowd, gt_segms,
rois, rois_num, labels_int32,
self.num_classes, self.mask_resolution)
outs = [to_tensor(v) for v in outs]
for v in outs:
v.stop_gradient = True
return outs
@register
@serializable
class RCNNBox(object):
    def __init__(self,
                 prior_box_var=[10., 10., 5., 5.],
                 code_type="decode_center_size",
                 box_normalized=False):
        super(RCNNBox, self).__init__()
        self.prior_box_var = prior_box_var
        self.code_type = code_type
        self.box_normalized = box_normalized

    def __call__(self, bbox_head_out, rois, im_shape, scale_factor):
        bbox_pred, cls_prob = bbox_head_out
        roi, rois_num = rois
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        scale_list = []
        origin_shape_list = []
        for idx, roi_per_im in enumerate(roi):
            rois_num_per_im = rois_num[idx]
            expand_im_shape = paddle.expand(im_shape[idx, :],
                                            [rois_num_per_im, 2])
            origin_shape_list.append(expand_im_shape)

        origin_shape = paddle.concat(origin_shape_list)

        # [N, C*4]
        bbox = paddle.concat(roi)
        bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var)
        scores = cls_prob[:, :-1]

        # [N*C, 4]
        bbox_num_class = bbox.shape[1] // 4
        bbox = paddle.reshape(bbox, [-1, bbox_num_class, 4])

        origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1)
        origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1)
        zeros = paddle.zeros_like(origin_h)
        x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros)
        y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros)
        x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros)
        y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros)
        bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
        bboxes = (bbox, rois_num)
        return bboxes, scores
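`RCNNBox` now decodes regression deltas with `delta2bbox` instead of `ops.box_coder`. The standard R-CNN decoding it relies on can be sketched for a single box as follows (a simplified illustration: the real helper operates on batched tensors and also clips `dw`/`dh`):

```python
import math

def delta2bbox(delta, box, weights=(10., 10., 5., 5.)):
    # Decode (dx, dy, dw, dh) regression deltas against a proposal box
    # (x1, y1, x2, y2); the weights follow the R-CNN convention.
    dx, dy, dw, dh = (d / w for d, w in zip(delta, weights))
    x1, y1, x2, y2 = box
    bw, bh = x2 - x1, y2 - y1
    cx, cy = x1 + 0.5 * bw, y1 + 0.5 * bh
    # Shift the center and rescale the size.
    pcx, pcy = cx + dx * bw, cy + dy * bh
    pw, ph = bw * math.exp(dw), bh * math.exp(dh)
    return (pcx - 0.5 * pw, pcy - 0.5 * ph, pcx + 0.5 * pw, pcy + 0.5 * ph)
```

With zero deltas the proposal is returned unchanged, which is what makes the weights act purely as a scaling of the regression targets.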
@register
@serializable
class DecodeClipNms(object):
__shared__ = ['num_classes']
def __init__(
self,
num_classes=81,
keep_top_k=100,
score_threshold=0.05,
nms_threshold=0.5, ):
super(DecodeClipNms, self).__init__()
self.num_classes = num_classes
self.keep_top_k = keep_top_k
self.score_threshold = score_threshold
self.nms_threshold = nms_threshold
def __call__(self, bboxes, bbox_prob, bbox_delta, im_info):
bboxes_np = (i.numpy() for i in bboxes)
# bbox, bbox_num
outs = bbox_post_process(bboxes_np,
bbox_prob.numpy(),
bbox_delta.numpy(),
im_info.numpy(), self.keep_top_k,
self.score_threshold, self.nms_threshold,
self.num_classes)
outs = [to_tensor(v) for v in outs]
for v in outs:
v.stop_gradient = True
return outs
@register
...@@ -589,7 +346,6 @@ class MultiClassNMS(object):
                 nms_threshold=.5,
                 normalized=False,
                 nms_eta=1.0,
                 return_rois_num=True):
        super(MultiClassNMS, self).__init__()
        self.score_threshold = score_threshold
...@@ -598,14 +354,28 @@ class MultiClassNMS(object):
        self.nms_threshold = nms_threshold
        self.normalized = normalized
        self.nms_eta = nms_eta
        self.return_rois_num = return_rois_num

    def __call__(self, bboxes, score, background_label=-1):
        """
        bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape
                                         [N, M, 4], N is the batch size and M
                                         is the number of bboxes
                                      2. (List[Tensor]) bboxes and bbox_num,
                                         bboxes have shape of [M, C, 4], C
                                         is the class number and bbox_num means
                                         the number of bboxes of each batch with
                                         shape [N,]
        score (Tensor): Predicted scores with shape [N, C, M] or [M, C]
        background_label (int): Ignore the background label; For example, RCNN
                                is num_classes and YOLO is -1.
        """
        kwargs = self.__dict__.copy()
        if isinstance(bboxes, tuple):
            bboxes, bbox_num = bboxes
            kwargs.update({'rois_num': bbox_num})
        if background_label > -1:
            kwargs.update({'background_label': background_label})
        return ops.multiclass_nms(bboxes, score, **kwargs)
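`ops.multiclass_nms` ultimately applies greedy non-maximum suppression per class. A compact single-class sketch of the underlying algorithm (the thresholds mirror the defaults above; this is not the Paddle op itself):

```python
def iou(a, b):
    # Intersection-over-union of two (x1, y1, x2, y2) boxes.
    ix = max(0., min(a[2], b[2]) - max(a[0], b[0]))
    iy = max(0., min(a[3], b[3]) - max(a[1], b[1]))
    inter = ix * iy
    area = lambda r: max(0., r[2] - r[0]) * max(0., r[3] - r[1])
    union = area(a) + area(b) - inter
    return inter / union if union > 0 else 0.

def nms(boxes, scores, iou_thresh=0.5, score_thresh=0.05):
    # Greedy NMS: visit boxes by descending score, keep a box only if
    # it overlaps every already-kept box by less than iou_thresh.
    order = sorted((i for i, s in enumerate(scores) if s >= score_thresh),
                   key=lambda i: -scores[i])
    keep = []
    for i in order:
        if all(iou(boxes[i], boxes[j]) < iou_thresh for j in keep):
            keep.append(i)
    return keep
```

Multiclass NMS simply runs this per class and then keeps the top `keep_top_k` detections overall.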
......
import numpy as np
from ppdet.core.workspace import register
@register
class Mask(object):
__inject__ = ['mask_target_generator']
def __init__(self, mask_target_generator):
super(Mask, self).__init__()
self.mask_target_generator = mask_target_generator
def __call__(self, inputs, rois, targets):
mask_rois, rois_has_mask_int32 = self.generate_mask_target(inputs, rois,
targets)
return mask_rois, rois_has_mask_int32
def generate_mask_target(self, inputs, rois, targets):
labels_int32 = targets['labels_int32']
proposals, proposals_num = rois
mask_rois, mask_rois_num, self.rois_has_mask_int32, self.mask_int32 = self.mask_target_generator(
im_info=inputs['im_info'],
gt_classes=inputs['gt_class'],
is_crowd=inputs['is_crowd'],
gt_segms=inputs['gt_poly'],
rois=proposals,
rois_num=proposals_num,
labels_int32=labels_int32)
self.mask_rois = (mask_rois, mask_rois_num)
return self.mask_rois, self.rois_has_mask_int32
def get_targets(self):
return self.mask_int32
...@@ -21,6 +21,7 @@ from paddle.nn import Conv2D
from paddle.nn.initializer import XavierUniform
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec


@register
...@@ -29,18 +30,19 @@ class FPN(Layer):
    def __init__(self,
                 in_channels,
                 out_channel,
                 spatial_scales=[0.25, 0.125, 0.0625, 0.03125],
                 has_extra_convs=False,
                 extra_stage=1,
                 use_c5=True,
                 relu_before_extra_convs=True):
        super(FPN, self).__init__()
        self.out_channel = out_channel
        for s in range(extra_stage):
            spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
        self.spatial_scales = spatial_scales
        self.has_extra_convs = has_extra_convs
        self.extra_stage = extra_stage
        self.use_c5 = use_c5
        self.relu_before_extra_convs = relu_before_extra_convs

...@@ -48,11 +50,7 @@ class FPN(Layer):
        self.fpn_convs = []
        fan = out_channel * 3 * 3

        for i in range(len(in_channels)):
            if i == 3:
                lateral_name = 'fpn_inner_res5_sum'
            else:
...@@ -65,9 +63,7 @@ class FPN(Layer):
                    out_channels=out_channel,
                    kernel_size=1,
                    weight_attr=ParamAttr(
                        initializer=XavierUniform(fan_out=in_c))))
            self.lateral_convs.append(lateral)

            fpn_name = 'fpn_res{}_sum'.format(i + 2)
...@@ -79,17 +75,14 @@ class FPN(Layer):
                    kernel_size=3,
                    padding=1,
                    weight_attr=ParamAttr(
                        initializer=XavierUniform(fan_out=fan))))
            self.fpn_convs.append(fpn_conv)

        # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
        if self.has_extra_convs:
            for lvl in range(self.extra_stage):  # P6 P7 ...
                if lvl == 0 and self.use_c5:
                    in_c = in_channels[-1]
                else:
                    in_c = out_channel
                extra_fpn_name = 'fpn_{}'.format(lvl + 2)
...@@ -102,51 +95,60 @@ class FPN(Layer):
                        stride=2,
                        padding=1,
                        weight_attr=ParamAttr(
                            initializer=XavierUniform(fan_out=fan))))
                self.fpn_convs.append(extra_fpn_conv)

    @classmethod
    def from_config(cls, cfg, input_shape):
        return {
            'in_channels': [i.channels for i in input_shape],
            'spatial_scales': [1.0 / i.stride for i in input_shape],
        }

    def forward(self, body_feats):
        laterals = []
        num_levels = len(body_feats)
        for i in range(num_levels):
            laterals.append(self.lateral_convs[i](body_feats[i]))

        for i in range(1, num_levels):
            lvl = num_levels - i
            upsample = F.interpolate(
                laterals[lvl],
                scale_factor=2.,
                mode='nearest', )
            laterals[lvl - 1] += upsample

        fpn_output = []
        for lvl in range(num_levels):
            fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))

        if self.extra_stage > 0:
            # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
            if not self.has_extra_convs:
                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs'
                fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))
            # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
            else:
                if self.use_c5:
                    extra_source = body_feats[-1]
                else:
                    extra_source = fpn_output[-1]
                fpn_output.append(self.fpn_convs[num_levels](extra_source))

                for i in range(1, self.extra_stage):
                    if self.relu_before_extra_convs:
                        fpn_output.append(self.fpn_convs[num_levels + i](
                            F.relu(fpn_output[-1])))
                    else:
                        fpn_output.append(self.fpn_convs[num_levels + i](
                            fpn_output[-1]))
        return fpn_output

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=self.out_channel, stride=1. / s)
            for s in self.spatial_scales
        ]
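The top-down pathway in `FPN.forward` upsamples each deeper lateral and adds it into the level below, from coarsest to finest. With scalars standing in for feature maps (so "upsampling" is a no-op in this toy version), the merge order can be sketched as:

```python
def fpn_top_down(laterals):
    # laterals[0] is the finest level, laterals[-1] the coarsest.
    # Mirrors the loop in FPN.forward:
    #   laterals[lvl - 1] += upsample(laterals[lvl])
    merged = list(laterals)
    for lvl in range(len(merged) - 1, 0, -1):
        merged[lvl - 1] += merged[lvl]
    return merged
```

Each level accumulates every coarser level above it, which is exactly the semantics the real layer implements with `F.interpolate` and elementwise addition.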
...@@ -32,7 +32,6 @@ __all__ = [
    'roi_pool',
    'roi_align',
    'prior_box',
    'anchor_generator',
    'generate_proposals',
    'iou_similarity',
    'box_coder',
...@@ -169,6 +168,7 @@ def roi_align(input,
              spatial_scale=1.0,
              sampling_ratio=-1,
              rois_num=None,
              aligned=True,
              name=None):
    """
...@@ -239,7 +239,7 @@ def roi_align(input,
        align_out = core.ops.roi_align(
            input, rois, rois_num, "pooled_height", pooled_height,
            "pooled_width", pooled_width, "spatial_scale", spatial_scale,
            "sampling_ratio", sampling_ratio)  #, "aligned", aligned)
        return align_out
    else:
...@@ -264,7 +264,8 @@ def roi_align(input,
            "pooled_height": pooled_height,
            "pooled_width": pooled_width,
            "spatial_scale": spatial_scale,
            "sampling_ratio": sampling_ratio,
            #"aligned": aligned,
        })
    return align_out
...@@ -846,117 +847,6 @@ def prior_box(input,
    return box, var
@paddle.jit.not_to_static
def anchor_generator(input,
anchor_sizes=None,
aspect_ratios=None,
variance=[0.1, 0.1, 0.2, 0.2],
stride=None,
offset=0.5,
name=None):
"""
This op generate anchors for Faster RCNN algorithm.
Each position of the input produce N anchors, N =
size(anchor_sizes) * size(aspect_ratios). The order of generated anchors
is firstly aspect_ratios loop then anchor_sizes loop.
Args:
input(Tensor): 4-D Tensor with shape [N,C,H,W]. The input feature map.
anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated
anchors, given in absolute pixels e.g. [64., 128., 256., 512.].
For instance, the anchor size of 64 means the area of this anchor
equals to 64**2. None by default.
aspect_ratios(float32|list|tuple, optional): The height / width ratios
of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default.
variance(list|tuple, optional): The variances to be used in box
regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by
default.
stride(list|tuple, optional): The anchors stride across width and height.
The data type is float32. e.g. [16.0, 16.0]. None by default.
offset(float32, optional): Prior boxes center offset. 0.5 by default.
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and None
by default.
Returns:
Tuple:
Anchors(Tensor): The output anchors with a layout of [H, W, num_anchors, 4].
H is the height of input, W is the width of input,
num_anchors is the box count of each position.
            Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized.
Variances(Tensor): The expanded variances of anchors
with a layout of [H, W, num_priors, 4].
H is the height of input, W is the width of input
num_anchors is the box count of each position.
Each variance is in (xcenter, ycenter, w, h) format.
Examples:
.. code-block:: python
import paddle
from ppdet.modeling import ops
paddle.enable_static()
conv1 = paddle.static.data(name='input', shape=[None, 48, 16, 16], dtype='float32')
anchor, var = ops.anchor_generator(
input=conv1,
anchor_sizes=[64, 128, 256, 512],
aspect_ratios=[0.5, 1.0, 2.0],
variance=[0.1, 0.1, 0.2, 0.2],
stride=[16.0, 16.0],
offset=0.5)
"""
helper = LayerHelper("anchor_generator", **locals())
dtype = helper.input_dtype()
def _is_list_or_tuple_(data):
return (isinstance(data, list) or isinstance(data, tuple))
if not _is_list_or_tuple_(anchor_sizes):
anchor_sizes = [anchor_sizes]
if not _is_list_or_tuple_(aspect_ratios):
aspect_ratios = [aspect_ratios]
if not (_is_list_or_tuple_(stride) and len(stride) == 2):
raise ValueError('stride should be a list or tuple ',
'with length 2, (stride_width, stride_height).')
anchor_sizes = list(map(float, anchor_sizes))
aspect_ratios = list(map(float, aspect_ratios))
stride = list(map(float, stride))
if in_dygraph_mode():
attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios,
'variances', variance, 'stride', stride, 'offset', offset)
anchor, var = core.ops.anchor_generator(input, *attrs)
return anchor, var
else:
attrs = {
'anchor_sizes': anchor_sizes,
'aspect_ratios': aspect_ratios,
'variances': variance,
'stride': stride,
'offset': offset
}
anchor = helper.create_variable_for_type_inference(dtype)
var = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type="anchor_generator",
inputs={"Input": input},
outputs={"Anchors": anchor,
"Variances": var},
attrs=attrs, )
anchor.stop_gradient = True
var.stop_gradient = True
return anchor, var
@paddle.jit.not_to_static
def multiclass_nms(bboxes,
                   scores,
...@@ -966,7 +856,7 @@ def multiclass_nms(bboxes,
                   nms_threshold=0.3,
                   normalized=True,
                   nms_eta=1.,
                   background_label=-1,
                   return_index=False,
                   return_rois_num=True,
                   rois_num=None,
...
...@@ -3,47 +3,140 @@ import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.bbox_utils import nonempty_bbox
from . import ops


@register
class BBoxPostProcess(object):
    __shared__ = ['num_classes']
    __inject__ = ['decode', 'nms']

    def __init__(self, num_classes=80, decode=None, nms=None):
        super(BBoxPostProcess, self).__init__()
        self.num_classes = num_classes
        self.decode = decode
        self.nms = nms

    def __call__(self, head_out, rois, im_shape, scale_factor):
        """
        Decode the bbox and do NMS if needed.

        Returns:
            bbox_pred(Tensor): The prediction with shape [N, 6], including
                labels, scores and bboxes. The bboxes are sized to the input
                image and may be used in other branches.
            bbox_num(Tensor): The number of predictions for each image in
                the batch.
        """
        if self.nms is not None:
            bboxes, score = self.decode(head_out, rois, im_shape, scale_factor)
            bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes)
        else:
            bbox_pred, bbox_num = self.decode(head_out, rois, im_shape,
                                              scale_factor)
        return bbox_pred, bbox_num
    def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
        """
        Rescale, clip and filter the bbox from the output of NMS to
        get the final prediction.

        Args:
            bboxes(Tensor): The output of __call__ with shape [N, 6].
        Returns:
            bbox_pred(Tensor): The prediction with shape [N, 6], including
                labels, scores and bboxes. The bboxes are sized to the
                original image.
        """
assert bboxes.shape[0] > 0, 'There is no detection output'
origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
origin_shape_list = []
scale_factor_list = []
# scale_factor: scale_y, scale_x
for i in range(bbox_num.shape[0]):
expand_shape = paddle.expand(origin_shape[i:i + 1, :],
[bbox_num[i], 2])
scale_y, scale_x = scale_factor[i]
scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
expand_scale = paddle.expand(scale, [bbox_num[i], 4])
origin_shape_list.append(expand_shape)
scale_factor_list.append(expand_scale)
self.origin_shape_list = paddle.concat(origin_shape_list)
scale_factor_list = paddle.concat(scale_factor_list)
# bboxes: [N, 6], label, score, bbox
pred_label = bboxes[:, 0:1]
pred_score = bboxes[:, 1:2]
pred_bbox = bboxes[:, 2:]
# rescale bbox to original image
scaled_bbox = pred_bbox / scale_factor_list
origin_h = self.origin_shape_list[:, 0]
origin_w = self.origin_shape_list[:, 1]
zeros = paddle.zeros_like(origin_h)
# clip bbox to [0, original_size]
x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros)
y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros)
x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros)
y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros)
pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
# filter empty bbox
keep_mask = nonempty_bbox(pred_bbox, return_mask=True)
keep_mask = paddle.unsqueeze(keep_mask, [1])
pred_label = paddle.where(keep_mask, pred_label,
paddle.ones_like(pred_label) * -1)
pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1)
return pred_result
    def get_origin_shape(self):
return self.origin_shape_list
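The core of `get_pred` above is a rescale-then-clip step per box. A pure-Python sketch with made-up numbers (scale factor 2.0, original image 100x80; the helper name is hypothetical, not part of the class):

```python
# Sketch of get_pred's per-box logic: divide by the scale factor to return
# to original-image coordinates, then clip into [0, origin_w] x [0, origin_h].
def rescale_and_clip(box, scale, origin_h, origin_w):
    # box: [x1, y1, x2, y2] on the resized image
    x1, y1, x2, y2 = (v / scale for v in box)
    clip = lambda v, hi: max(0.0, min(v, hi))
    return [clip(x1, origin_w), clip(y1, origin_h),
            clip(x2, origin_w), clip(y2, origin_h)]

pred = rescale_and_clip([20.0, 10.0, 250.0, 90.0], 2.0, 100, 80)
```

Note that x coordinates are clipped by the width and y coordinates by the height, matching the `origin_w`/`origin_h` usage in `get_pred`.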
@register
class MaskPostProcess(object):
    def __init__(self, binary_thresh=0.5):
        super(MaskPostProcess, self).__init__()
        self.binary_thresh = binary_thresh

    def paste_mask(self, masks, boxes, im_h, im_w):
        # paste each mask on image
        x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1)
        masks = paddle.unsqueeze(masks, [0, 1])
        img_y = paddle.arange(0, im_h, dtype='float32') + 0.5
        img_x = paddle.arange(0, im_w, dtype='float32') + 0.5
        img_y = (img_y - y0) / (y1 - y0) * 2 - 1
        img_x = (img_x - x0) / (x1 - x0) * 2 - 1
        img_x = paddle.unsqueeze(img_x, [1])
img_y = paddle.unsqueeze(img_y, [2])
N = boxes.shape[0]
gx = paddle.expand(img_x, [N, img_y.shape[1], img_x.shape[2]])
gy = paddle.expand(img_y, [N, img_y.shape[1], img_x.shape[2]])
grid = paddle.stack([gx, gy], axis=3)
img_masks = F.grid_sample(masks, grid, align_corners=False)
return img_masks[:, 0]
def __call__(self, mask_out, bboxes, bbox_num, origin_shape):
"""
Paste the mask prediction to the original image.
"""
assert bboxes.shape[0] > 0, 'There is no detection output'
num_mask = mask_out.shape[0]
# TODO: support bs > 1
pred_result = paddle.zeros(
[num_mask, origin_shape[0][0], origin_shape[0][1]], dtype='bool')
# TODO: optimize chunk paste
for i in range(bboxes.shape[0]):
im_h, im_w = origin_shape[i]
pred_mask = self.paste_mask(mask_out[i], bboxes[i:i + 1, 2:], im_h,
im_w)
pred_mask = pred_mask >= self.binary_thresh
pred_result[i] = pred_mask
return pred_result
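The coordinate normalization in `paste_mask` maps pixel centers inside a box onto the [-1, 1] range that `grid_sample` expects. A pure-Python sketch of that mapping, with a hypothetical box spanning columns 2..10:

```python
# Sketch of paste_mask's normalization: pixel center x + 0.5 inside
# the box [x0, x1) maps linearly into grid_sample's [-1, 1] range.
def to_grid(x_pixel, x0, x1):
    return (x_pixel + 0.5 - x0) / (x1 - x0) * 2 - 1

left = to_grid(2, 2.0, 10.0)    # first pixel column inside the box
right = to_grid(9, 2.0, 10.0)   # last pixel column inside the box
```

Pixels outside the box map outside [-1, 1], so `grid_sample` leaves them empty; pixels inside sample the corresponding location of the 28x28 (or similar) mask prediction.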
@register
...
from . import rpn_head
from .rpn_head import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .. import ops
@register
class AnchorGenerator(object):
def __init__(self,
anchor_sizes=[32, 64, 128, 256, 512],
aspect_ratios=[0.5, 1.0, 2.0],
strides=[16.0],
variance=[1.0, 1.0, 1.0, 1.0],
offset=0.):
super(AnchorGenerator, self).__init__()
self.anchor_sizes = anchor_sizes
self.aspect_ratios = aspect_ratios
self.strides = strides
self.variance = variance
self.cell_anchors = self._calculate_anchors(len(strides))
self.offset = offset
def _broadcast_params(self, params, num_features):
if not isinstance(params[0], (list, tuple)): # list[float]
return [params] * num_features
if len(params) == 1:
return list(params) * num_features
return params
def generate_cell_anchors(self, sizes, aspect_ratios):
anchors = []
for size in sizes:
area = size**2.0
for aspect_ratio in aspect_ratios:
w = math.sqrt(area / aspect_ratio)
h = aspect_ratio * w
x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
anchors.append([x0, y0, x1, y1])
return paddle.to_tensor(anchors, dtype='float32')
def _calculate_anchors(self, num_features):
sizes = self._broadcast_params(self.anchor_sizes, num_features)
aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features)
cell_anchors = [
self.generate_cell_anchors(s, a)
for s, a in zip(sizes, aspect_ratios)
]
return cell_anchors
def _create_grid_offsets(self, size, stride, offset):
grid_height, grid_width = size
shifts_x = paddle.arange(
offset * stride, grid_width * stride, step=stride, dtype='float32')
shifts_y = paddle.arange(
offset * stride, grid_height * stride, step=stride, dtype='float32')
shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape([-1])
shift_y = shift_y.reshape([-1])
return shift_x, shift_y
def _grid_anchors(self, grid_sizes):
anchors = []
for size, stride, base_anchors in zip(grid_sizes, self.strides,
self.cell_anchors):
shift_x, shift_y = self._create_grid_offsets(size, stride,
self.offset)
shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1)
anchors.append((shifts.reshape([-1, 1, 4]) + base_anchors.reshape(
[1, -1, 4])).reshape([-1, 4]))
return anchors
def __call__(self, input):
grid_sizes = [feature_map.shape[-2:] for feature_map in input]
anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
return anchors_over_all_feature_maps
@property
def num_anchors(self):
"""
        Returns:
            int: the number of anchors at every pixel location on a
                feature map. For example, with 3 aspect ratios and 5
                sizes at every pixel, the number of anchors is 15. For
                FPN models, `num_anchors` is the same on every feature map.
"""
return self.cell_anchors[0].shape[0]
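A pure-Python sketch of `generate_cell_anchors` above: every anchor keeps the area `size**2` while `h / w` equals the aspect ratio (the sizes and ratios below are illustrative):

```python
import math

# Sketch of generate_cell_anchors: area-preserving anchors centered at
# the origin, one per (size, aspect_ratio) pair.
def cell_anchors(sizes, aspect_ratios):
    anchors = []
    for size in sizes:
        area = size ** 2.0
        for ratio in aspect_ratios:
            w = math.sqrt(area / ratio)
            h = ratio * w
            anchors.append((-w / 2, -h / 2, w / 2, h / 2))
    return anchors

boxes = cell_anchors([32], [0.5, 1.0, 2.0])
```

The ordering matches the class: the aspect-ratio loop is innermost, so with one size and three ratios `num_anchors` is 3.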
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from .. import ops
@register
@serializable
class ProposalGenerator(object):
def __init__(self,
pre_nms_top_n=12000,
post_nms_top_n=2000,
nms_thresh=.5,
min_size=.1,
eta=1.,
topk_after_collect=False):
super(ProposalGenerator, self).__init__()
self.pre_nms_top_n = pre_nms_top_n
self.post_nms_top_n = post_nms_top_n
self.nms_thresh = nms_thresh
self.min_size = min_size
self.eta = eta
self.topk_after_collect = topk_after_collect
def __call__(self, scores, bbox_deltas, anchors, im_shape):
top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n
variances = paddle.ones_like(anchors)
rpn_rois, rpn_rois_prob, rpn_rois_num = ops.generate_proposals(
scores,
bbox_deltas,
im_shape,
anchors,
variances,
pre_nms_top_n=self.pre_nms_top_n,
post_nms_top_n=top_n,
nms_thresh=self.nms_thresh,
min_size=self.min_size,
eta=self.eta,
return_rois_num=True)
return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n
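The `top_n` rule in `__call__` can be reduced to a one-line sketch: with `topk_after_collect`, each FPN level keeps `pre_nms_top_n` proposals and the final cut happens after all levels are merged; otherwise each level is cut to `post_nms_top_n` immediately (a sketch of the selection rule only, not the `generate_proposals` op):

```python
# Sketch of ProposalGenerator's per-level budget selection.
def per_level_top_n(pre_nms_top_n, post_nms_top_n, topk_after_collect):
    return pre_nms_top_n if topk_after_collect else post_nms_top_n
```

This is why `__call__` also returns `self.post_nms_top_n`: the caller needs it for the deferred top-k over the collected proposals.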
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import Normal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ppdet.modeling import ops
from .anchor_generator import AnchorGenerator
from .target_layer import RPNTargetAssign
from .proposal_generator import ProposalGenerator
class RPNFeat(nn.Layer):
def __init__(self, feat_in=1024, feat_out=1024):
super(RPNFeat, self).__init__()
# rpn feat is shared with each level
self.rpn_conv = nn.Conv2D(
in_channels=feat_in,
out_channels=feat_out,
kernel_size=3,
padding=1,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0., std=0.01)))
def forward(self, feats):
rpn_feats = []
for feat in feats:
rpn_feats.append(F.relu(self.rpn_conv(feat)))
return rpn_feats
@register
class RPNHead(nn.Layer):
def __init__(self,
anchor_generator=AnchorGenerator().__dict__,
rpn_target_assign=RPNTargetAssign().__dict__,
train_proposal=ProposalGenerator(12000, 2000).__dict__,
test_proposal=ProposalGenerator().__dict__,
in_channel=1024):
super(RPNHead, self).__init__()
self.anchor_generator = anchor_generator
self.rpn_target_assign = rpn_target_assign
self.train_proposal = train_proposal
self.test_proposal = test_proposal
if isinstance(anchor_generator, dict):
self.anchor_generator = AnchorGenerator(**anchor_generator)
if isinstance(rpn_target_assign, dict):
self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign)
if isinstance(train_proposal, dict):
self.train_proposal = ProposalGenerator(**train_proposal)
if isinstance(test_proposal, dict):
self.test_proposal = ProposalGenerator(**test_proposal)
num_anchors = self.anchor_generator.num_anchors
self.rpn_feat = RPNFeat(in_channel, in_channel)
# rpn head is shared with each level
# rpn roi classification scores
self.rpn_rois_score = nn.Conv2D(
in_channels=in_channel,
out_channels=num_anchors,
kernel_size=1,
padding=0,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0., std=0.01)))
# rpn roi bbox regression deltas
self.rpn_rois_delta = nn.Conv2D(
in_channels=in_channel,
out_channels=4 * num_anchors,
kernel_size=1,
padding=0,
weight_attr=paddle.ParamAttr(initializer=Normal(
mean=0., std=0.01)))
@classmethod
def from_config(cls, cfg, input_shape):
# FPN share same rpn head
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channel': input_shape.channels}
def forward(self, feats, inputs):
rpn_feats = self.rpn_feat(feats)
scores = []
deltas = []
for rpn_feat in rpn_feats:
rrs = self.rpn_rois_score(rpn_feat)
rrd = self.rpn_rois_delta(rpn_feat)
scores.append(rrs)
deltas.append(rrd)
anchors = self.anchor_generator(rpn_feats)
rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs)
if self.training:
loss = self.get_loss(scores, deltas, anchors, inputs)
return rois, rois_num, loss
else:
return rois, rois_num, None
def _gen_proposal(self, scores, bbox_deltas, anchors, inputs):
"""
scores (list[Tensor]): Multi-level scores prediction
bbox_deltas (list[Tensor]): Multi-level deltas prediction
anchors (list[Tensor]): Multi-level anchors
inputs (dict): ground truth info
"""
prop_gen = self.train_proposal if self.training else self.test_proposal
im_shape = inputs['im_shape']
batch_size = im_shape.shape[0]
rpn_rois_list = [[] for i in range(batch_size)]
rpn_prob_list = [[] for i in range(batch_size)]
rpn_rois_num_list = [[] for i in range(batch_size)]
# Generate proposals for each level and each batch.
# Discard batch-computing to avoid sorting bbox cross different batches.
for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, anchors):
for i in range(batch_size):
rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen(
scores=rpn_score[i:i + 1],
bbox_deltas=rpn_delta[i:i + 1],
anchors=anchor,
im_shape=im_shape[i:i + 1])
if rpn_rois.shape[0] > 0:
rpn_rois_list[i].append(rpn_rois)
rpn_prob_list[i].append(rpn_rois_prob)
rpn_rois_num_list[i].append(rpn_rois_num)
# Collect multi-level proposals for each batch
# Get 'topk' of them as final output
rois_collect = []
rois_num_collect = []
for i in range(batch_size):
if len(scores) > 1:
rpn_rois = paddle.concat(rpn_rois_list[i])
rpn_prob = paddle.concat(rpn_prob_list[i]).flatten()
if rpn_prob.shape[0] > post_nms_top_n:
topk_prob, topk_inds = paddle.topk(rpn_prob, post_nms_top_n)
topk_rois = paddle.gather(rpn_rois, topk_inds)
else:
topk_rois = rpn_rois
topk_prob = rpn_prob
else:
topk_rois = rpn_rois_list[i][0]
topk_prob = rpn_prob_list[i][0].flatten()
rois_collect.append(topk_rois)
rois_num_collect.append(paddle.shape(topk_rois)[0])
rois_num_collect = paddle.concat(rois_num_collect)
return rois_collect, rois_num_collect
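The per-image collection step in `_gen_proposal` amounts to merging every level's proposals and keeping the `post_nms_top_n` highest-scoring ones. A pure-Python sketch with toy `(roi, score)` pairs (the data is made up):

```python
# Sketch of the collect-then-topk step: concatenate per-level proposals,
# sort by score, keep the top post_nms_top_n.
def collect_topk(levels, post_nms_top_n):
    # levels: list of per-FPN-level lists of (roi, score) pairs
    merged = [p for level in levels for p in level]
    merged.sort(key=lambda p: p[1], reverse=True)
    return merged[:post_nms_top_n]

levels = [[("a", 0.9), ("b", 0.2)], [("c", 0.8), ("d", 0.5)]]
top = collect_topk(levels, 3)
```

As the comment in `_gen_proposal` notes, this is done per image precisely so that scores are never sorted across different images of the batch.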
def get_loss(self, pred_scores, pred_deltas, anchors, inputs):
"""
pred_scores (list[Tensor]): Multi-level scores prediction
pred_deltas (list[Tensor]): Multi-level deltas prediction
anchors (list[Tensor]): Multi-level anchors
inputs (dict): ground truth info, including im, gt_bbox, gt_score
"""
anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors]
anchors = paddle.concat(anchors)
scores = [
paddle.reshape(
paddle.transpose(
v, perm=[0, 2, 3, 1]),
shape=(v.shape[0], -1, 1)) for v in pred_scores
]
scores = paddle.concat(scores, axis=1)
deltas = [
paddle.reshape(
paddle.transpose(
v, perm=[0, 2, 3, 1]),
shape=(v.shape[0], -1, 4)) for v in pred_deltas
]
deltas = paddle.concat(deltas, axis=1)
score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs,
anchors)
scores = paddle.reshape(x=scores, shape=(-1, ))
deltas = paddle.reshape(x=deltas, shape=(-1, 4))
score_tgt = paddle.concat(score_tgt)
score_tgt.stop_gradient = True
pos_mask = score_tgt == 1
pos_ind = paddle.nonzero(pos_mask)
valid_mask = score_tgt >= 0
valid_ind = paddle.nonzero(valid_mask)
# cls loss
score_pred = paddle.gather(scores, valid_ind)
score_label = paddle.gather(score_tgt, valid_ind).cast('float32')
score_label.stop_gradient = True
loss_rpn_cls = F.binary_cross_entropy_with_logits(
logit=score_pred, label=score_label, reduction="sum")
# reg loss
loc_pred = paddle.gather(deltas, pos_ind)
loc_tgt = paddle.concat(loc_tgt)
loc_tgt = paddle.gather(loc_tgt, pos_ind)
loc_tgt.stop_gradient = True
loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum()
return {
'loss_rpn_cls': loss_rpn_cls / norm,
'loss_rpn_reg': loss_rpn_reg / norm
}
import six
import math
import numpy as np
import paddle
from ..bbox_utils import bbox2delta, bbox_overlaps
import copy
def rpn_anchor_target(anchors,
gt_boxes,
rpn_batch_size_per_im,
rpn_positive_overlap,
rpn_negative_overlap,
rpn_fg_fraction,
use_random=True,
batch_size=1,
weights=[1., 1., 1., 1.]):
tgt_labels = []
tgt_bboxes = []
tgt_deltas = []
for i in range(batch_size):
gt_bbox = gt_boxes[i]
# Step1: match anchor and gt_bbox
matches, match_labels, matched_vals = label_box(
anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True)
# Step2: sample anchor
fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im,
rpn_fg_fraction, 0, use_random)
# Fill with the ignore label (-1), then set positive and negative labels
labels = paddle.full(match_labels.shape, -1, dtype='int32')
labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds))
labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds))
# Step3: make output
matched_gt_boxes = paddle.gather(gt_bbox, matches)
tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights)
labels.stop_gradient = True
matched_gt_boxes.stop_gradient = True
tgt_delta.stop_gradient = True
tgt_labels.append(labels)
tgt_bboxes.append(matched_gt_boxes)
tgt_deltas.append(tgt_delta)
return tgt_labels, tgt_bboxes, tgt_deltas
def label_box(anchors, gt_boxes, positive_overlap, negative_overlap,
allow_low_quality):
iou = bbox_overlaps(gt_boxes, anchors)
    if iou.numel() == 0:
        default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64')
        default_match_labels = paddle.full((iou.shape[1], ), -1, dtype='int32')
        # return three values to match the normal path below
        default_matched_vals = paddle.zeros((iou.shape[1], ), dtype='float32')
        return default_matches, default_match_labels, default_matched_vals
matched_vals, matches = paddle.topk(iou, k=1, axis=0)
match_labels = paddle.full(matches.shape, -1, dtype='int32')
match_labels = paddle.where(matched_vals < negative_overlap,
paddle.zeros_like(match_labels), match_labels)
match_labels = paddle.where(matched_vals >= positive_overlap,
paddle.ones_like(match_labels), match_labels)
if allow_low_quality:
highest_quality_foreach_gt = iou.max(axis=1, keepdim=True)
pred_inds_with_highest_quality = (
iou == highest_quality_foreach_gt).cast('int32').sum(0,
keepdim=True)
match_labels = paddle.where(pred_inds_with_highest_quality > 0,
paddle.ones_like(match_labels),
match_labels)
matches = matches.flatten()
match_labels = match_labels.flatten()
matched_vals = matched_vals.flatten()
return matches, match_labels, matched_vals
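A pure-Python sketch of `label_box`'s matching rule: each anchor takes its best-overlapping gt, then overlap thresholds (0.7/0.3 here, the RPN defaults) decide positive (1), negative (0), or ignore (-1). The IoU matrix below is made up, and the low-quality-match rescue is omitted:

```python
# Sketch of label_box: per-anchor argmax over gts, then threshold labels.
def label_anchors(iou, pos_thr, neg_thr):
    # iou[g][a]: overlap of gt g with anchor a
    matches, labels = [], []
    for a in range(len(iou[0])):
        col = [iou[g][a] for g in range(len(iou))]
        best = max(range(len(col)), key=col.__getitem__)
        matches.append(best)
        v = col[best]
        labels.append(1 if v >= pos_thr else (0 if v < neg_thr else -1))
    return matches, labels

matches, labels = label_anchors(
    [[0.8, 0.1, 0.5], [0.2, 0.25, 0.6]], 0.7, 0.3)
```

Anchors whose best overlap falls between the two thresholds stay at -1 and are excluded from the loss.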
def subsample_labels(labels,
num_samples,
fg_fraction,
bg_label=0,
use_random=True):
positive = paddle.nonzero(
paddle.logical_and(labels != -1, labels != bg_label))
negative = paddle.nonzero(labels == bg_label)
positive = positive.cast('int32').flatten()
negative = negative.cast('int32').flatten()
fg_num = int(num_samples * fg_fraction)
fg_num = min(positive.numel(), fg_num)
bg_num = num_samples - fg_num
bg_num = min(negative.numel(), bg_num)
# randomly select positive and negative examples
fg_perm = paddle.randperm(positive.numel(), dtype='int32')
fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num])
bg_perm = paddle.randperm(negative.numel(), dtype='int32')
bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num])
if use_random:
fg_inds = paddle.gather(positive, fg_perm)
bg_inds = paddle.gather(negative, bg_perm)
else:
fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num])
bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num])
return fg_inds, bg_inds
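The sampling budget in `subsample_labels` caps foregrounds at `num_samples * fg_fraction` and fills the remainder with background, each capped by its pool size. A sketch over toy pool counts (256 samples, 0.5 fraction, as in RPN defaults):

```python
# Sketch of subsample_labels' budget arithmetic only (no random selection).
def sample_counts(num_pos, num_neg, num_samples, fg_fraction):
    fg_num = min(num_pos, int(num_samples * fg_fraction))
    bg_num = min(num_neg, num_samples - fg_num)
    return fg_num, bg_num

few_fg = sample_counts(30, 5000, 256, 0.5)    # fg-starved image
many_fg = sample_counts(400, 5000, 256, 0.5)  # fg-rich image
```

When foregrounds are scarce, backgrounds absorb the unused budget, so the minibatch stays at `num_samples` whenever the pools allow.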
def filter_roi(rois, max_overlap):
ws = rois[:, 2] - rois[:, 0]
hs = rois[:, 3] - rois[:, 1]
    # paddle.logical_and is binary; chain two calls to combine three masks
    valid_mask = paddle.logical_and(
        paddle.logical_and(ws > 0, hs > 0), max_overlap < 1)
keep = paddle.nonzero(valid_mask)
if keep.numel() > 0:
        # keep from nonzero has shape [K, 1]; index its first column
        return rois[keep[:, 0]]
return paddle.zeros((1, 4), dtype='float32')
def generate_proposal_target(rpn_rois,
gt_classes,
gt_boxes,
batch_size_per_im,
fg_fraction,
fg_thresh,
bg_thresh,
num_classes,
use_random=True,
is_cascade_rcnn=False,
max_overlaps=None):
rois_with_gt = []
tgt_labels = []
tgt_bboxes = []
sampled_max_overlaps = []
tgt_gt_inds = []
new_rois_num = []
for i, rpn_roi in enumerate(rpn_rois):
max_overlap = max_overlaps[i] if is_cascade_rcnn else None
gt_bbox = gt_boxes[i]
        # use a per-image name so the gt_classes list is not shadowed
        gt_class = gt_classes[i]
        if is_cascade_rcnn:
            rpn_roi = filter_roi(rpn_roi, max_overlap)
        bbox = paddle.concat([rpn_roi, gt_bbox])
        # Step1: label bbox
        matches, match_labels, matched_vals = label_box(
            bbox, gt_bbox, fg_thresh, bg_thresh, False)
        # Step2: sample bbox
        sampled_inds, sampled_gt_classes = sample_bbox(
            matches, match_labels, gt_class, batch_size_per_im, fg_fraction,
            num_classes, use_random)
# Step3: make output
rois_per_image = paddle.gather(bbox, sampled_inds)
sampled_gt_ind = paddle.gather(matches, sampled_inds)
sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind)
sampled_overlap = paddle.gather(matched_vals, sampled_inds)
rois_per_image.stop_gradient = True
sampled_gt_ind.stop_gradient = True
sampled_bbox.stop_gradient = True
sampled_overlap.stop_gradient = True
tgt_labels.append(sampled_gt_classes)
tgt_bboxes.append(sampled_bbox)
rois_with_gt.append(rois_per_image)
sampled_max_overlaps.append(sampled_overlap)
tgt_gt_inds.append(sampled_gt_ind)
new_rois_num.append(paddle.shape(sampled_inds)[0])
new_rois_num = paddle.concat(new_rois_num)
return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num, sampled_max_overlaps
def sample_bbox(
matches,
match_labels,
gt_classes,
batch_size_per_im,
fg_fraction,
num_classes,
use_random=True, ):
gt_classes = paddle.gather(gt_classes, matches)
gt_classes = paddle.where(match_labels == 0,
paddle.ones_like(gt_classes) * num_classes,
gt_classes)
gt_classes = paddle.where(match_labels == -1,
paddle.ones_like(gt_classes) * -1, gt_classes)
rois_per_image = int(batch_size_per_im)
fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction,
num_classes, use_random)
sampled_inds = paddle.concat([fg_inds, bg_inds])
sampled_gt_classes = paddle.gather(gt_classes, sampled_inds)
return sampled_inds, sampled_gt_classes
def _strip_pad(gt_polys):
new_gt_polys = []
for i in range(gt_polys.shape[0]):
gt_segs = []
for j in range(gt_polys[i].shape[0]):
new_poly = []
polys = gt_polys[i][j]
for ii in range(polys.shape[0]):
x, y = polys[ii]
if (x == -1 and y == -1):
continue
elif (x >= 0 or y >= 0):
new_poly.extend([x, y]) # array, one poly
if len(new_poly) > 6:
gt_segs.append(np.array(new_poly).astype('float64'))
new_gt_polys.append(gt_segs)
return new_gt_polys
def polygons_to_mask(polygons, height, width):
"""
Args:
polygons (list[ndarray]): each array has shape (Nx2,)
height, width (int)
Returns:
ndarray: a bool mask of shape (height, width)
"""
import pycocotools.mask as mask_util
assert len(polygons) > 0, "COCOAPI does not support empty polygons"
rles = mask_util.frPyObjects(polygons, height, width)
rle = mask_util.merge(rles)
    return mask_util.decode(rle).astype(bool)
def rasterize_polygons_within_box(poly, box, resolution):
w, h = box[2] - box[0], box[3] - box[1]
polygons = copy.deepcopy(poly)
for p in polygons:
p[0::2] = p[0::2] - box[0]
p[1::2] = p[1::2] - box[1]
ratio_h = resolution / max(h, 0.1)
ratio_w = resolution / max(w, 0.1)
if ratio_h == ratio_w:
for p in polygons:
p *= ratio_h
else:
for p in polygons:
p[0::2] *= ratio_w
p[1::2] *= ratio_h
# 3. Rasterize the polygons with coco api
mask = polygons_to_mask(polygons, resolution, resolution)
mask = paddle.to_tensor(mask, dtype='int32')
return mask
def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,
num_classes, resolution):
mask_rois = []
mask_rois_num = []
tgt_masks = []
tgt_classes = []
mask_index = []
tgt_weights = []
for k in range(len(rois)):
has_fg = True
rois_per_im = rois[k]
gt_segms_per_im = gt_segms[k]
labels_per_im = labels_int32[k]
fg_inds = paddle.nonzero(
paddle.logical_and(labels_per_im != -1, labels_per_im !=
num_classes))
if fg_inds.numel() == 0:
has_fg = False
fg_inds = paddle.ones([1], dtype='int32')
inds_per_im = sampled_gt_inds[k]
inds_per_im = paddle.gather(inds_per_im, fg_inds)
gt_segms_per_im = paddle.gather(gt_segms_per_im, inds_per_im)
fg_rois = paddle.gather(rois_per_im, fg_inds)
fg_classes = paddle.gather(labels_per_im, fg_inds)
fg_segms = paddle.gather(gt_segms_per_im, fg_inds)
weight = paddle.ones([fg_rois.shape[0]], dtype='float32')
if not has_fg:
weight = weight - 1
# remove padding
gt_polys = fg_segms.numpy()
boxes = fg_rois.numpy()
new_gt_polys = _strip_pad(gt_polys)
results = [
rasterize_polygons_within_box(poly, box, resolution)
for poly, box in zip(new_gt_polys, boxes)
]
tgt_mask = paddle.stack(results)
tgt_mask.stop_gradient = True
fg_rois.stop_gradient = True
mask_index.append(fg_inds)
mask_rois.append(fg_rois)
mask_rois_num.append(paddle.shape(fg_rois)[0])
tgt_classes.append(fg_classes)
tgt_masks.append(tgt_mask)
tgt_weights.append(weight)
mask_index = paddle.concat(mask_index)
mask_rois_num = paddle.concat(mask_rois_num)
tgt_classes = paddle.concat(tgt_classes, axis=0)
tgt_masks = paddle.concat(tgt_masks, axis=0)
tgt_weights = paddle.concat(tgt_weights, axis=0)
return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from ppdet.core.workspace import register, serializable
from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target
@register
@serializable
class RPNTargetAssign(object):
def __init__(self,
batch_size_per_im=256,
fg_fraction=0.5,
positive_overlap=0.7,
negative_overlap=0.3,
use_random=True):
super(RPNTargetAssign, self).__init__()
self.batch_size_per_im = batch_size_per_im
self.fg_fraction = fg_fraction
self.positive_overlap = positive_overlap
self.negative_overlap = negative_overlap
self.use_random = use_random
def __call__(self, inputs, anchors):
"""
inputs: ground-truth instances.
anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors in all feature maps.
"""
gt_boxes = inputs['gt_bbox']
batch_size = gt_boxes.shape[0]
tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target(
anchors, gt_boxes, self.batch_size_per_im, self.positive_overlap,
self.negative_overlap, self.fg_fraction, self.use_random,
batch_size)
norm = self.batch_size_per_im * batch_size
return tgt_labels, tgt_bboxes, tgt_deltas, norm
@register
class BBoxAssigner(object):
__shared__ = ['num_classes']
def __init__(self,
batch_size_per_im=512,
fg_fraction=.25,
fg_thresh=[.5, ],
bg_thresh=[.5, ],
use_random=True,
is_cls_agnostic=False,
num_classes=80):
super(BBoxAssigner, self).__init__()
self.batch_size_per_im = batch_size_per_im
self.fg_fraction = fg_fraction
self.fg_thresh = fg_thresh
self.bg_thresh = bg_thresh
self.use_random = use_random
self.is_cls_agnostic = is_cls_agnostic
self.num_classes = num_classes
def __call__(self,
rpn_rois,
rpn_rois_num,
inputs,
stage=0,
max_overlap=None):
        is_cascade = stage > 0
gt_classes = inputs['gt_class']
gt_boxes = inputs['gt_bbox']
# rois, tgt_labels, tgt_bboxes, tgt_gt_inds
# new_rois_num, sampled_max_overlaps
outs = generate_proposal_target(
rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im,
self.fg_fraction, self.fg_thresh[stage], self.bg_thresh[stage],
self.num_classes, self.use_random, is_cascade, max_overlap)
rois = outs[0]
rois_num = outs[-2]
max_overlaps = outs[-1]
# tgt_labels, tgt_bboxes, tgt_gt_inds
targets = outs[1:4]
return rois, rois_num, max_overlaps, targets
@register
@serializable
class MaskAssigner(object):
__shared__ = ['num_classes', 'mask_resolution']
def __init__(self, num_classes=80, mask_resolution=14):
super(MaskAssigner, self).__init__()
self.num_classes = num_classes
self.mask_resolution = mask_resolution
def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs):
gt_segms = inputs['gt_poly']
outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds,
self.num_classes, self.mask_resolution)
# mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights
return outs
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
class ShapeSpec(
namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
"""
A simple structure that contains basic shape specification about a tensor.
It is often used as the auxiliary inputs/outputs of models,
to complement the lack of shape inference ability among paddle modules.
Attributes:
channels:
height:
width:
stride:
"""
def __new__(cls, *, channels=None, height=None, width=None, stride=None):
return super().__new__(cls, channels, height, width, stride)
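As a quick illustration of the keyword-only constructor: fields must be passed by name, and anything omitted defaults to `None` (the `spec` variable below is just an example):

```python
from collections import namedtuple

class ShapeSpec(
        namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
    # Keyword-only constructor: unspecified fields default to None.
    def __new__(cls, *, channels=None, height=None, width=None, stride=None):
        return super().__new__(cls, channels, height, width, stride)

spec = ShapeSpec(channels=256, stride=4)
print(spec.channels, spec.stride, spec.height)  # 256 4 None
```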
from .bbox import *
from .mask import *
from .target import *
from .post_process import *
import numpy as np
from numba import jit
@jit
def bbox2delta(bboxes1, bboxes2, weights):
ex_w = bboxes1[:, 2] - bboxes1[:, 0] + 1
ex_h = bboxes1[:, 3] - bboxes1[:, 1] + 1
ex_ctr_x = bboxes1[:, 0] + 0.5 * ex_w
ex_ctr_y = bboxes1[:, 1] + 0.5 * ex_h
gt_w = bboxes2[:, 2] - bboxes2[:, 0] + 1
gt_h = bboxes2[:, 3] - bboxes2[:, 1] + 1
gt_ctr_x = bboxes2[:, 0] + 0.5 * gt_w
gt_ctr_y = bboxes2[:, 1] + 0.5 * gt_h
dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
dw = (np.log(gt_w / ex_w)) / weights[2]
dh = (np.log(gt_h / ex_h)) / weights[3]
deltas = np.vstack([dx, dy, dw, dh]).transpose()
return deltas
@jit
def delta2bbox(deltas, boxes, weights, bbox_clip=4.13):
if boxes.shape[0] == 0:
return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
boxes = boxes.astype(deltas.dtype, copy=False)
widths = boxes[:, 2] - boxes[:, 0] + 1.0
heights = boxes[:, 3] - boxes[:, 1] + 1.0
ctr_x = boxes[:, 0] + 0.5 * widths
ctr_y = boxes[:, 1] + 0.5 * heights
wx, wy, ww, wh = weights
dx = deltas[:, 0::4] * wx
dy = deltas[:, 1::4] * wy
dw = deltas[:, 2::4] * ww
dh = deltas[:, 3::4] * wh
# Prevent sending too large values into np.exp()
dw = np.minimum(dw, bbox_clip)
dh = np.minimum(dh, bbox_clip)
pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
pred_w = np.exp(dw) * widths[:, np.newaxis]
pred_h = np.exp(dh) * heights[:, np.newaxis]
pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
# x1
pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
# y1
pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
# x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
# y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1
return pred_boxes
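As a sanity check of the two transforms above, encoding a box against an anchor and decoding the result should reproduce the box exactly (both directions use the legacy `+ 1` width/height convention). A self-contained NumPy round trip, with helper names of our own choosing:

```python
import numpy as np

def boxes_to_deltas(src, tgt, weights):
    # Encode tgt boxes as (dx, dy, dw, dh) deltas relative to src boxes
    # (legacy "+ 1" width/height convention, matching bbox2delta above).
    sw = src[:, 2] - src[:, 0] + 1
    sh = src[:, 3] - src[:, 1] + 1
    scx = src[:, 0] + 0.5 * sw
    scy = src[:, 1] + 0.5 * sh
    tw = tgt[:, 2] - tgt[:, 0] + 1
    th = tgt[:, 3] - tgt[:, 1] + 1
    tcx = tgt[:, 0] + 0.5 * tw
    tcy = tgt[:, 1] + 0.5 * th
    return np.stack([(tcx - scx) / sw / weights[0],
                     (tcy - scy) / sh / weights[1],
                     np.log(tw / sw) / weights[2],
                     np.log(th / sh) / weights[3]], axis=1)

def deltas_to_boxes(deltas, src, weights):
    # Decode deltas back to absolute boxes; exact inverse of the encoder.
    sw = src[:, 2] - src[:, 0] + 1
    sh = src[:, 3] - src[:, 1] + 1
    scx = src[:, 0] + 0.5 * sw
    scy = src[:, 1] + 0.5 * sh
    cx = deltas[:, 0] * weights[0] * sw + scx
    cy = deltas[:, 1] * weights[1] * sh + scy
    w = np.exp(deltas[:, 2] * weights[2]) * sw
    h = np.exp(deltas[:, 3] * weights[3]) * sh
    return np.stack([cx - 0.5 * w, cy - 0.5 * h,
                     cx + 0.5 * w - 1, cy + 0.5 * h - 1], axis=1)

src = np.array([[0., 0., 9., 9.]])
tgt = np.array([[2., 3., 13., 17.]])
w = [1., 1., 1., 1.]
rec = deltas_to_boxes(boxes_to_deltas(src, tgt, w), src, w)
print(np.allclose(rec, tgt))  # True
```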
@jit
def expand_bbox(bboxes, scale):
w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5
h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5
x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5
y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5
w_half *= scale
h_half *= scale
bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32)
bboxes_exp[:, 0] = x_c - w_half
bboxes_exp[:, 2] = x_c + w_half
bboxes_exp[:, 1] = y_c - h_half
bboxes_exp[:, 3] = y_c + h_half
return bboxes_exp
@jit
def clip_bbox(boxes, im_shape):
assert boxes.shape[1] % 4 == 0, \
'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
boxes.shape[1]
)
# x1 >= 0
boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
# y2 < im_shape[0]
boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
return boxes
@jit
def bbox_overlaps(bboxes1, bboxes2):
w1 = np.maximum(bboxes1[:, 2] - bboxes1[:, 0] + 1, 0)
h1 = np.maximum(bboxes1[:, 3] - bboxes1[:, 1] + 1, 0)
w2 = np.maximum(bboxes2[:, 2] - bboxes2[:, 0] + 1, 0)
h2 = np.maximum(bboxes2[:, 3] - bboxes2[:, 1] + 1, 0)
area1 = w1 * h1
area2 = w2 * h2
boxes1_x1, boxes1_y1, boxes1_x2, boxes1_y2 = np.split(bboxes1, 4, axis=1)
boxes2_x1, boxes2_y1, boxes2_x2, boxes2_y2 = np.split(bboxes2, 4, axis=1)
all_pairs_min_ymax = np.minimum(boxes1_y2, np.transpose(boxes2_y2))
all_pairs_max_ymin = np.maximum(boxes1_y1, np.transpose(boxes2_y1))
inter_h = np.maximum(all_pairs_min_ymax - all_pairs_max_ymin + 1, 0.)
all_pairs_min_xmax = np.minimum(boxes1_x2, np.transpose(boxes2_x2))
all_pairs_max_xmin = np.maximum(boxes1_x1, np.transpose(boxes2_x1))
inter_w = np.maximum(all_pairs_min_xmax - all_pairs_max_xmin + 1, 0.)
inter_area = inter_w * inter_h
    # IoU = inter / (area1 + area2 - inter); the sum of areas is computed
    # first, then the intersection is subtracted once.
    sum_area = np.expand_dims(area1, 1) + np.expand_dims(area2, 0)
    overlaps = inter_area / (sum_area - inter_area)
return overlaps
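The same pairwise IoU can be written with broadcasting instead of explicit transposes; `iou_matrix` below is an illustrative re-implementation (our naming), keeping the inclusive `+ 1` pixel convention used above:

```python
import numpy as np

def iou_matrix(a, b):
    # Pairwise IoU between box sets a (N, 4) and b (K, 4),
    # using the same inclusive "+ 1" pixel convention as above.
    area_a = (a[:, 2] - a[:, 0] + 1) * (a[:, 3] - a[:, 1] + 1)
    area_b = (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)
    ix = np.maximum(
        np.minimum(a[:, None, 2], b[None, :, 2]) -
        np.maximum(a[:, None, 0], b[None, :, 0]) + 1, 0.)
    iy = np.maximum(
        np.minimum(a[:, None, 3], b[None, :, 3]) -
        np.maximum(a[:, None, 1], b[None, :, 1]) + 1, 0.)
    inter = ix * iy
    return inter / (area_a[:, None] + area_b[None, :] - inter)

a = np.array([[0., 0., 9., 9.]])     # 10x10 box
b = np.array([[0., 0., 9., 9.],      # identical box -> IoU 1.0
              [5., 0., 14., 9.]])    # half-overlapping box
m = iou_matrix(a, b)
print(m)
```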
@jit
def nms(dets, thresh):
if dets.shape[0] == 0:
return []
scores = dets[:, 0]
x1 = dets[:, 1]
y1 = dets[:, 2]
x2 = dets[:, 3]
y2 = dets[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
ndets = dets.shape[0]
    suppressed = np.zeros((ndets), dtype=np.int32)  # np.int alias is removed in NumPy >= 1.24
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
return np.where(suppressed == 0)[0]
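The greedy suppression loop above can be sketched in vectorized NumPy. `greedy_nms` is an illustrative re-implementation (our naming), which keeps a box only while its IoU with every higher-scoring kept box stays below the threshold:

```python
import numpy as np

def greedy_nms(dets, thresh):
    # dets: (N, 5) as [score, x1, y1, x2, y2], same layout as above.
    # Keep the highest-scoring box, drop boxes overlapping it by >= thresh,
    # then repeat on the remainder.
    scores = dets[:, 0]
    boxes = dets[:, 1:]
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # Intersection of the current best box with all remaining boxes.
        xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
        yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
        xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
        yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        area = lambda b: (b[:, 2] - b[:, 0] + 1) * (b[:, 3] - b[:, 1] + 1)
        ovr = inter / (area(boxes[i:i + 1])[0] + area(boxes[order[1:]]) - inter)
        order = order[1:][ovr < thresh]
    return keep

dets = np.array([
    [0.9, 0., 0., 9., 9.],     # best box
    [0.8, 1., 1., 10., 10.],   # heavily overlaps the best -> suppressed
    [0.7, 50., 50., 59., 59.]  # far away -> kept
])
print(greedy_nms(dets, 0.5))  # [0, 2]
```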
def nms_with_decode(bboxes,
bbox_probs,
bbox_deltas,
im_info,
keep_top_k=100,
score_thresh=0.05,
nms_thresh=0.5,
class_nums=81,
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2]):
bboxes_num = [0, bboxes.shape[0]]
bboxes_v = np.array(bboxes)
bbox_probs_v = np.array(bbox_probs)
bbox_deltas_v = np.array(bbox_deltas)
variance_v = np.array(bbox_reg_weights)
im_results = [[] for _ in range(len(bboxes_num) - 1)]
new_bboxes_num = [0]
for i in range(len(bboxes_num) - 1):
start = bboxes_num[i]
end = bboxes_num[i + 1]
if start == end:
continue
bbox_deltas_n = bbox_deltas_v[start:end, :] # box delta
rois_n = bboxes_v[start:end, :] # box
rois_n = rois_n / im_info[i][2] # scale
rois_n = delta2bbox(bbox_deltas_n, rois_n, variance_v)
rois_n = clip_bbox(rois_n, np.round(im_info[i][:2] / im_info[i][2]))
cls_boxes = [[] for _ in range(class_nums)]
scores_n = bbox_probs_v[start:end, :]
for j in range(1, class_nums):
inds = np.where(scores_n[:, j] > score_thresh)[0]
scores_j = scores_n[inds, j]
rois_j = rois_n[inds, j * 4:(j + 1) * 4]
dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype(
np.float32, copy=False)
keep = nms(dets_j, nms_thresh)
nms_dets = dets_j[keep, :]
#add labels
label = np.array([j for _ in range(len(keep))])
nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype(
np.float32, copy=False)
cls_boxes[j] = nms_dets
# Limit to max_per_image detections **over all classes**
image_scores = np.hstack(
[cls_boxes[j][:, 1] for j in range(1, class_nums)])
if len(image_scores) > keep_top_k:
image_thresh = np.sort(image_scores)[-keep_top_k]
for j in range(1, class_nums):
keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0]
cls_boxes[j] = cls_boxes[j][keep, :]
im_results_n = np.vstack([cls_boxes[j] for j in range(1, class_nums)])
im_results[i] = im_results_n
new_bboxes_num.append(len(im_results_n) + new_bboxes_num[-1])
labels = im_results_n[:, 0]
scores = im_results_n[:, 1]
boxes = im_results_n[:, 2:]
im_results = np.vstack([im_results[k] for k in range(len(bboxes_num) - 1)])
new_bboxes_num = np.array(new_bboxes_num)
return new_bboxes_num, im_results
@jit
def compute_bbox_targets(bboxes1, bboxes2, labels, bbox_reg_weights):
assert bboxes1.shape[0] == bboxes2.shape[0]
assert bboxes1.shape[1] == 4
assert bboxes2.shape[1] == 4
targets = np.zeros(bboxes1.shape)
bbox_reg_weights = np.asarray(bbox_reg_weights)
targets = bbox2delta(
bboxes1=bboxes1, bboxes2=bboxes2, weights=bbox_reg_weights)
return np.hstack([labels[:, np.newaxis], targets]).astype(
np.float32, copy=False)
#@jit
def expand_bbox_targets(bbox_targets_input,
class_nums=81,
is_cls_agnostic=False):
class_labels = bbox_targets_input[:, 0]
fg_inds = np.where(class_labels > 0)[0]
if is_cls_agnostic:
class_nums = 2
bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums))
bbox_inside_weights = np.zeros(bbox_targets.shape)
for ind in fg_inds:
class_label = int(class_labels[ind]) if not is_cls_agnostic else 1
start_ind = class_label * 4
end_ind = class_label * 4 + 4
bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:]
bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0)
return bbox_targets, bbox_inside_weights
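The class-specific target layout produced above is easier to see on a toy input. `expand_targets` is a simplified stand-in for `expand_bbox_targets` (labels and deltas pre-split instead of stacked in one array):

```python
import numpy as np

def expand_targets(labels, deltas, class_nums):
    # Scatter each foreground row's 4 deltas into the 4-wide slot of its
    # class; background rows (label 0) stay all-zero.
    out = np.zeros((labels.shape[0], 4 * class_nums), dtype=np.float32)
    weights = np.zeros_like(out)
    for ind in np.where(labels > 0)[0]:
        s = int(labels[ind]) * 4
        out[ind, s:s + 4] = deltas[ind]
        weights[ind, s:s + 4] = 1.0
    return out, weights

labels = np.array([2, 0])  # one foreground (class 2), one background
deltas = np.array([[.1, .2, .3, .4], [.5, .6, .7, .8]], dtype=np.float32)
out, w = expand_targets(labels, deltas, class_nums=3)
print(out[0])  # only columns 8..11 (the class-2 slot) are non-zero
```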
import six
import math
import numpy as np
from numba import jit
@jit
def decode(cnts, m):
v = 0
mask = []
for j in range(m):
for k in range(cnts[j]):
mask.append(v)
v = 1 - v
return mask
#@jit
def poly2mask(xy, k, h, w):
scale = 5.
x = [int(scale * p + 0.5) for p in xy[::2]]
x = x + [x[0]]
y = [int(scale * p + 0.5) for p in xy[1::2]]
y = y + [y[0]]
m = sum([
int(max(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1]))) + int(1)
for j in range(k)
])
u, v = [], []
for j in range(k):
xs = x[j]
xe = x[j + 1]
ys = y[j]
ye = y[j + 1]
dx = abs(xe - xs)
dy = abs(ys - ye)
flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye)
if flip:
xs, xe = xe, xs
ys, ye = ye, ys
if dx >= dy:
if (dx == 0):
assert ye - ys == 0
s = 0 if dx == 0 else float(ye - ys) / dx
else:
if (dy == 0):
assert xe - xs == 0
s = 0 if dy == 0 else float(xe - xs) / dy
if dx >= dy:
ts = [dx - d if flip else d for d in range(dx + 1)]
u.extend([xs + t for t in ts])
v.extend([int(ys + s * t + .5) for t in ts])
else:
ts = [dy - d if flip else d for d in range(dy + 1)]
v.extend([t + ys for t in ts])
u.extend([int(xs + s * t + .5) for t in ts])
k = len(u)
    x = np.zeros((k), np.int32)  # np.int alias is removed in NumPy >= 1.24
    y = np.zeros((k), np.int32)
m = 0
for j in six.moves.xrange(1, k):
if u[j] != u[j - 1]:
xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1))
xd = (xd + .5) / scale - .5
if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)):
continue
yd = float(v[j] if v[j] < v[j - 1] else v[j - 1])
yd = (yd + .5) / scale - .5
yd = math.ceil(0 if yd < 0 else (h if yd > h else yd))
x[m] = int(xd)
y[m] = int(yd)
m += 1
k = m
a = [int(x[i] * h + y[i]) for i in range(k)]
a.append(h * w)
a.sort()
b = [0] + a[:len(a) - 1]
a = [c - d for (c, d) in zip(a, b)]
k += 1
b = [0 for i in range(k)]
b[0] = a[0]
m, j = 1, 1
while (j < k):
if a[j] > 0:
b[m] = a[j]
m += 1
j += 1
else:
j += 1
if (j < k):
b[m - 1] += a[j]
j += 1
mask = decode(b, m)
    mask = np.array(mask, dtype=np.int32).reshape((w, h))
mask = mask.transpose((1, 0))
return mask
def polys_to_boxes(polys):
"""Convert a list of polygons into an array of tight bounding boxes."""
boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32)
for j in range(len(polys)):
x_min, y_min = 10000000, 10000000
x_max, y_max = 0, 0
for i in range(len(polys[j])):
poly = polys[j][i]
x0 = min(min(p[::2]) for p in poly)
x_min = min(x0, x_min)
y0 = min(min(p[1::2]) for p in poly)
y_min = min(y0, y_min)
x1 = max(max(p[::2]) for p in poly)
x_max = max(x_max, x1)
y1 = max(max(p[1::2]) for p in poly)
y_max = max(y1, y_max)
boxes_from_polys[j, :] = [x_min, y_min, x_max, y_max]
return boxes_from_polys
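The tight-box computation reduces to a min/max over all polygon vertices. A minimal stand-alone version for a single instance (`poly_tight_box` is our illustrative name; polygons are flat `[x0, y0, x1, y1, ...]` lists):

```python
import numpy as np

def poly_tight_box(polys):
    # polys: list of flat [x0, y0, x1, y1, ...] arrays for one instance;
    # the tight box is the min/max over every vertex of every polygon.
    xs = np.concatenate([np.asarray(p)[0::2] for p in polys])
    ys = np.concatenate([np.asarray(p)[1::2] for p in polys])
    return [float(xs.min()), float(ys.min()), float(xs.max()), float(ys.max())]

polys = [[1., 2., 5., 2., 5., 6.], [0., 4., 3., 8.]]
print(poly_tight_box(polys))  # [0.0, 2.0, 5.0, 8.0]
```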
@jit
def bbox_overlaps_mask(boxes, query_boxes):
N = boxes.shape[0]
K = query_boxes.shape[0]
overlaps = np.zeros((N, K), dtype=boxes.dtype)
for k in range(K):
box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) *\
(query_boxes[k, 3] - query_boxes[k, 1] + 1)
for n in range(N):
iw = min(boxes[n, 2], query_boxes[k, 2]) -\
max(boxes[n, 0], query_boxes[k, 0]) + 1
if iw > 0:
ih = min(boxes[n, 3], query_boxes[k, 3]) -\
max(boxes[n, 1], query_boxes[k, 1]) + 1
if ih > 0:
ua = float(
(boxes[n, 2] - boxes[n, 0] + 1) *\
(boxes[n, 3] - boxes[n, 1] + 1) +\
box_area - iw * ih)
overlaps[n, k] = iw * ih / ua
return overlaps
@jit
def polys_to_mask_wrt_box(polygons, box, M):
"""Convert from the COCO polygon segmentation format to a binary mask
encoded as a 2D array of data type numpy.float32. The polygon segmentation
is understood to be enclosed in the given box and rasterized to an M x M
mask. The resulting mask is therefore of shape (M, M).
"""
w = box[2] - box[0]
h = box[3] - box[1]
w = np.maximum(w, 1)
h = np.maximum(h, 1)
polygons_norm = []
i = 0
for poly in polygons:
p = np.array(poly, dtype=np.float32)
p = p.reshape(-1)
p[0::2] = (p[0::2] - box[0]) * M / w
p[1::2] = (p[1::2] - box[1]) * M / h
polygons_norm.append(p)
mask = []
for polygons in polygons_norm:
assert polygons.shape[0] % 2 == 0, polygons.shape
k = polygons.shape[0] // 2
one_msk = poly2mask(polygons, k, M, M)
mask.append(one_msk)
mask = np.array(mask)
# Flatten in case polygons was a list
mask = np.sum(mask, axis=0)
mask = np.array(mask > 0, dtype=np.float32)
return mask
#@jit
def expand_mask_targets(masks, mask_class_labels, resolution, num_classes):
"""Expand masks from shape (#masks, resolution ** 2)
to (#masks, #classes * resolution ** 2) to encode class
specific mask targets.
"""
assert masks.shape[0] == mask_class_labels.shape[0]
# Target values of -1 are "don't care" / ignore labels
mask_targets = -np.ones(
(masks.shape[0], num_classes * resolution**2), dtype=np.int32)
for i in range(masks.shape[0]):
cls = int(mask_class_labels[i])
start = resolution**2 * cls
end = start + resolution**2
# Ignore background instance
# (only happens when there is no fg samples in an image)
if cls > 0:
mask_targets[i, start:end] = masks[i, :]
return mask_targets
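A toy run of the same expansion scheme (a simplified re-implementation, not the module function) makes the `-1` "don't care" layout visible:

```python
import numpy as np

def expand_masks(masks, cls_labels, resolution, num_classes):
    # -1 marks "don't care"; only the foreground class's resolution**2
    # slot receives the real mask values.
    R2 = resolution ** 2
    out = -np.ones((masks.shape[0], num_classes * R2), dtype=np.int32)
    for i, cls in enumerate(cls_labels):
        if cls > 0:
            out[i, cls * R2:(cls + 1) * R2] = masks[i]
    return out

masks = np.ones((1, 4), dtype=np.int32)  # one 2x2 mask, flattened
out = expand_masks(masks, [1], resolution=2, num_classes=3)
print(out[0])  # class-0 and class-2 slots stay -1, the class-1 slot is 1
```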
import six
import os
import numpy as np
from numba import jit
from .bbox import delta2bbox, clip_bbox, expand_bbox, nms
import pycocotools.mask as mask_util
import cv2
def bbox_post_process(bboxes,
                      bbox_prob,
                      bbox_deltas,
                      im_shape,
                      scale_factor,
                      keep_top_k=100,
                      score_thresh=0.05,
                      nms_thresh=0.5,
                      class_nums=81,
                      bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
                      with_background=True):
bbox, bbox_num = bboxes
new_bbox = [[] for _ in range(len(bbox_num))]
new_bbox_num = []
st_num = 0
end_num = 0
for i in range(len(bbox_num)):
box_num = bbox_num[i]
end_num += box_num
boxes = bbox[st_num:end_num, :] # bbox
boxes = boxes / scale_factor[i] # scale
bbox_delta = bbox_deltas[st_num:end_num, :, :] # bbox delta
bbox_delta = np.reshape(bbox_delta, (box_num, -1))
# step1: decode
boxes = delta2bbox(bbox_delta, boxes, bbox_reg_weights)
# step2: clip
boxes = clip_bbox(boxes, im_shape[i][:2] / scale_factor[i])
# step3: nms
cls_boxes = [[] for _ in range(class_nums)]
scores_n = bbox_prob[st_num:end_num, :]
for j in range(with_background, class_nums):
inds = np.where(scores_n[:, j] > score_thresh)[0]
scores_j = scores_n[inds, j]
rois_j = boxes[inds, j * 4:(j + 1) * 4]
dets_j = np.hstack((scores_j[:, np.newaxis], rois_j)).astype(
np.float32, copy=False)
keep = nms(dets_j, nms_thresh)
nms_dets = dets_j[keep, :]
#add labels
label = np.array([j for _ in range(len(keep))])
nms_dets = np.hstack((label[:, np.newaxis], nms_dets)).astype(
np.float32, copy=False)
cls_boxes[j] = nms_dets
st_num += box_num
# Limit to max_per_image detections **over all classes**
image_scores = np.hstack(
[cls_boxes[j][:, 1] for j in range(with_background, class_nums)])
if len(image_scores) > keep_top_k:
image_thresh = np.sort(image_scores)[-keep_top_k]
for j in range(with_background, class_nums):
keep = np.where(cls_boxes[j][:, 1] >= image_thresh)[0]
cls_boxes[j] = cls_boxes[j][keep, :]
new_bbox_n = np.vstack(
[cls_boxes[j] for j in range(with_background, class_nums)])
new_bbox[i] = new_bbox_n
new_bbox_num.append(len(new_bbox_n))
new_bbox = np.vstack([new_bbox[k] for k in range(len(bbox_num))])
new_bbox_num = np.array(new_bbox_num).astype('int32')
return new_bbox, new_bbox_num
@jit
def mask_post_process(det_res,
im_shape,
scale_factor,
resolution=14,
binary_thresh=0.5):
bbox = det_res['bbox']
bbox_num = det_res['bbox_num']
masks = det_res['mask']
if masks.shape[0] == 0:
return masks
M = resolution
scale = (M + 2.0) / M
boxes = bbox[:, 2:]
labels = bbox[:, 0]
segms_results = [[] for _ in range(len(bbox_num))]
sum = 0
st_num = 0
end_num = 0
for i in range(len(bbox_num)):
length = bbox_num[i]
end_num += length
cls_segms = []
boxes_n = boxes[st_num:end_num]
labels_n = labels[st_num:end_num]
masks_n = masks[st_num:end_num]
im_h = int(round(im_shape[i][0] / scale_factor[i, 0]))
im_w = int(round(im_shape[i][1] / scale_factor[i, 0]))
boxes_n = expand_bbox(boxes_n, scale)
boxes_n = boxes_n.astype(np.int32)
padded_mask = np.zeros((M + 2, M + 2), dtype=np.float32)
for j in range(len(boxes_n)):
class_id = int(labels_n[j])
padded_mask[1:-1, 1:-1] = masks_n[j, class_id, :, :]
ref_box = boxes_n[j, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > binary_thresh, dtype=np.uint8)
im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
x_0 = max(ref_box[0], 0)
x_1 = min(ref_box[2] + 1, im_w)
y_0 = max(ref_box[1], 0)
y_1 = min(ref_box[3] + 1, im_h)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[
1]), (x_0 - ref_box[0]):(x_1 - ref_box[0])]
sum += im_mask.sum()
rle = mask_util.encode(
np.array(
im_mask[:, :, np.newaxis], order='F'))[0]
cls_segms.append(rle)
segms_results[i] = np.array(cls_segms)[:, np.newaxis]
st_num += length
segms_results = np.vstack([segms_results[k] for k in range(len(bbox_num))])
bboxes = np.hstack([segms_results, bbox])
return bboxes[:, :3]
def get_det_res(bboxes, scores, labels, bbox_nums, image_id,
                label_to_cat_id_map):
    det_res = []
    k = 0
    for i in range(len(bbox_nums)):
        cur_image_id = int(image_id[i][0])
        det_nums = bbox_nums[i]
        for j in range(det_nums):
            box = bboxes[k]
            score = float(scores[k])
            label = int(labels[k])
            # advance k before filtering so skipped entries are not re-read
            k = k + 1
            if label < 0:
                continue
            xmin, ymin, xmax, ymax = box.tolist()
            category_id = label_to_cat_id_map[label]
            w = xmax - xmin
            h = ymax - ymin
            bbox = [xmin, ymin, w, h]
            dt_res = {
                'image_id': cur_image_id,
                'category_id': category_id,
                'bbox': bbox,
                'score': score
            }
            det_res.append(dt_res)
    return det_res
def get_seg_res(masks, scores, labels, mask_nums, image_id,
                label_to_cat_id_map):
    seg_res = []
    k = 0
    for i in range(len(mask_nums)):
        cur_image_id = int(image_id[i][0])
        det_nums = mask_nums[i]
        for j in range(det_nums):
            mask = masks[k]
            score = float(scores[k])
            label = int(labels[k])
            k = k + 1
            cat_id = label_to_cat_id_map[label]
            # encode the binary mask into COCO RLE format
            rle = mask_util.encode(
                np.array(
                    mask[:, :, None], order="F", dtype="uint8"))[0]
            if six.PY3:
                if 'counts' in rle:
                    rle['counts'] = rle['counts'].decode("utf8")
            sg_res = {
                'image_id': cur_image_id,
                'category_id': cat_id,
                'segmentation': rle,
                'score': score
            }
            seg_res.append(sg_res)
    return seg_res
import six
import math
import numpy as np
from numba import jit
from .bbox import *
from .mask import *
@jit
def generate_rpn_anchor_target(anchors,
gt_boxes,
is_crowd,
im_info,
rpn_straddle_thresh,
rpn_batch_size_per_im,
rpn_positive_overlap,
rpn_negative_overlap,
rpn_fg_fraction,
use_random=True,
anchor_reg_weights=[1., 1., 1., 1.]):
anchor_num = anchors.shape[0]
batch_size = gt_boxes.shape[0]
loc_indexes = []
cls_indexes = []
tgt_labels = []
tgt_deltas = []
anchor_inside_weights = []
for i in range(batch_size):
# TODO: move anchor filter into anchor generator
im_height = im_info[i][0]
im_width = im_info[i][1]
im_scale = im_info[i][2]
if rpn_straddle_thresh >= 0:
anchor_inds = np.where((anchors[:, 0] >= -rpn_straddle_thresh) & (
anchors[:, 1] >= -rpn_straddle_thresh) & (
anchors[:, 2] < im_width + rpn_straddle_thresh) & (
anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
anchor = anchors[anchor_inds, :]
else:
anchor_inds = np.arange(anchors.shape[0])
anchor = anchors
gt_bbox = gt_boxes[i] * im_scale
is_crowd_slice = is_crowd[i]
not_crowd_inds = np.where(is_crowd_slice == 0)[0]
gt_bbox = gt_bbox[not_crowd_inds]
# Step1: match anchor and gt_bbox
anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = label_anchor(anchor,
gt_bbox)
# Step2: sample anchor
fg_inds, bg_inds, fg_fake_inds, fake_num = sample_anchor(
anchor_gt_bbox_iou, labels, rpn_positive_overlap,
rpn_negative_overlap, rpn_batch_size_per_im, rpn_fg_fraction,
use_random)
# Step3: make output
loc_inds = np.hstack([fg_fake_inds, fg_inds])
cls_inds = np.hstack([fg_inds, bg_inds])
sampled_labels = labels[cls_inds]
sampled_anchors = anchor[loc_inds]
sampled_gt_boxes = gt_bbox[anchor_gt_bbox_inds[loc_inds]]
sampled_deltas = bbox2delta(sampled_anchors, sampled_gt_boxes,
anchor_reg_weights)
anchor_inside_weight = np.zeros((len(loc_inds), 4), dtype=np.float32)
anchor_inside_weight[fake_num:, :] = 1
loc_indexes.append(anchor_inds[loc_inds] + i * anchor_num)
cls_indexes.append(anchor_inds[cls_inds] + i * anchor_num)
tgt_labels.append(sampled_labels)
tgt_deltas.append(sampled_deltas)
anchor_inside_weights.append(anchor_inside_weight)
loc_indexes = np.concatenate(loc_indexes)
cls_indexes = np.concatenate(cls_indexes)
tgt_labels = np.concatenate(tgt_labels).astype('float32')
tgt_deltas = np.vstack(tgt_deltas).astype('float32')
anchor_inside_weights = np.vstack(anchor_inside_weights)
return loc_indexes, cls_indexes, tgt_labels, tgt_deltas, anchor_inside_weights
@jit
def label_anchor(anchors, gt_boxes):
iou = bbox_overlaps(anchors, gt_boxes)
# every gt's anchor's index
gt_bbox_anchor_inds = iou.argmax(axis=0)
gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])]
gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0]
# every anchor's gt bbox's index
anchor_gt_bbox_inds = iou.argmax(axis=1)
anchor_gt_bbox_iou = iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds]
labels = np.ones((iou.shape[0], ), dtype=np.int32) * -1
labels[gt_bbox_anchor_iou_inds] = 1
return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels
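The two argmax passes in `label_anchor` are easier to follow on a toy IoU matrix: one picks the best anchor per gt box (these become forced positives regardless of threshold), the other picks the best gt per anchor (its regression target):

```python
import numpy as np

# Toy IoU matrix: 3 anchors (rows) x 2 gt boxes (columns).
iou = np.array([[0.7, 0.1],
                [0.3, 0.6],
                [0.2, 0.2]])

# Best anchor for each gt (argmax over rows) -> forced positives,
# even if their IoU is below the positive threshold.
gt_best_anchor = iou.argmax(axis=0)
# Best gt for each anchor (argmax over columns) -> regression target.
anchor_best_gt = iou.argmax(axis=1)
anchor_best_iou = iou[np.arange(3), anchor_best_gt]
print(gt_best_anchor, anchor_best_gt, anchor_best_iou)
```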
@jit
def sample_anchor(anchor_gt_bbox_iou,
labels,
rpn_positive_overlap,
rpn_negative_overlap,
rpn_batch_size_per_im,
rpn_fg_fraction,
use_random=True):
labels[anchor_gt_bbox_iou >= rpn_positive_overlap] = 1
num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg and use_random:
disable_inds = np.random.choice(
fg_inds, size=(len(fg_inds) - num_fg), replace=False)
else:
disable_inds = fg_inds[num_fg:]
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
bg_inds = np.where(anchor_gt_bbox_iou < rpn_negative_overlap)[0]
if len(bg_inds) > num_bg and use_random:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
else:
enable_inds = bg_inds[:num_bg]
fg_fake_inds = np.array([], np.int32)
fg_value = np.array([fg_inds[0]], np.int32)
fake_num = 0
for bg_id in enable_inds:
if bg_id in fg_inds:
fake_num += 1
fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
labels[enable_inds] = 0
fg_inds = np.where(labels == 1)[0]
bg_inds = np.where(labels == 0)[0]
return fg_inds, bg_inds, fg_fake_inds, fake_num
@jit
def filter_roi(rois, max_overlap):
ws = rois[:, 2] - rois[:, 0] + 1
hs = rois[:, 3] - rois[:, 1] + 1
keep = np.where((ws > 0) & (hs > 0) & (max_overlap < 1))[0]
if len(keep) > 0:
return rois[keep, :]
return np.zeros((1, 4)).astype('float32')
@jit
def generate_proposal_target(rpn_rois,
rpn_rois_num,
gt_classes,
is_crowd,
gt_boxes,
im_info,
batch_size_per_im,
fg_fraction,
fg_thresh,
bg_thresh_hi,
bg_thresh_lo,
bbox_reg_weights,
class_nums=81,
use_random=True,
is_cls_agnostic=False,
is_cascade_rcnn=False,
max_overlaps=None):
rois = []
tgt_labels = []
tgt_deltas = []
rois_inside_weights = []
rois_outside_weights = []
sampled_max_overlaps = []
new_rois_num = []
st_num = 0
end_num = 0
for im_i in range(len(rpn_rois_num)):
length = rpn_rois_num[im_i]
end_num += length
rpn_roi = rpn_rois[st_num:end_num]
max_overlap = max_overlaps[st_num:end_num] if is_cascade_rcnn else None
im_scale = im_info[im_i][2]
rpn_roi = rpn_roi / im_scale
gt_bbox = gt_boxes[im_i]
if is_cascade_rcnn:
rpn_roi = filter_roi(rpn_roi, max_overlap)
bbox = np.vstack([gt_bbox, rpn_roi]).astype('float32')
# Step1: label bbox
roi_gt_bbox_inds, labels, max_overlap = label_bbox(
bbox, gt_bbox, gt_classes[im_i], is_crowd[im_i])
# Step2: sample bbox
fg_inds, bg_inds, fg_nums = sample_bbox(
max_overlap, batch_size_per_im, fg_fraction, fg_thresh,
bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
use_random, is_cls_agnostic, is_cascade_rcnn)
# Step3: make output
sampled_inds = np.append(fg_inds, bg_inds)
sampled_labels = labels[sampled_inds]
sampled_labels[fg_nums:] = 0
sampled_boxes = bbox[sampled_inds]
sampled_max_overlap = max_overlap[sampled_inds]
sampled_gt_boxes = gt_bbox[roi_gt_bbox_inds[sampled_inds]]
sampled_gt_boxes[fg_nums:, :] = 0
sampled_deltas = compute_bbox_targets(sampled_boxes, sampled_gt_boxes,
sampled_labels, bbox_reg_weights)
sampled_deltas[fg_nums:, :] = 0
sampled_deltas, bbox_inside_weights = expand_bbox_targets(
sampled_deltas, class_nums, is_cls_agnostic)
bbox_outside_weights = np.array(
bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
roi = sampled_boxes * im_scale
st_num += length
rois.append(roi)
new_rois_num.append(roi.shape[0])
tgt_labels.append(sampled_labels)
tgt_deltas.append(sampled_deltas)
rois_inside_weights.append(bbox_inside_weights)
rois_outside_weights.append(bbox_outside_weights)
sampled_max_overlaps.append(sampled_max_overlap)
rois = np.concatenate(rois, axis=0).astype(np.float32)
tgt_labels = np.concatenate(
tgt_labels, axis=0).astype(np.int32).reshape(-1, 1)
tgt_deltas = np.concatenate(tgt_deltas, axis=0).astype(np.float32)
rois_inside_weights = np.concatenate(
rois_inside_weights, axis=0).astype(np.float32)
rois_outside_weights = np.concatenate(
rois_outside_weights, axis=0).astype(np.float32)
sampled_max_overlaps = np.concatenate(
sampled_max_overlaps, axis=0).astype(np.float32)
new_rois_num = np.asarray(new_rois_num, np.int32)
return rois, tgt_labels, tgt_deltas, rois_inside_weights, rois_outside_weights, new_rois_num, sampled_max_overlaps
@jit
def label_bbox(boxes, gt_boxes, gt_classes, is_crowd, class_nums=81):
iou = bbox_overlaps(boxes, gt_boxes)
# every roi's gt box's index
roi_gt_bbox_inds = np.zeros((boxes.shape[0]), dtype=np.int32)
roi_gt_bbox_iou = np.zeros((boxes.shape[0], class_nums), dtype=np.float32)
iou_argmax = iou.argmax(axis=1)
iou_max = iou.max(axis=1)
overlapped_boxes_ind = np.where(iou_max > 0)[0].astype('int32')
roi_gt_bbox_inds[overlapped_boxes_ind] = iou_argmax[overlapped_boxes_ind]
overlapped_boxes_gt_classes = gt_classes[iou_argmax[
overlapped_boxes_ind]].astype('int32')
roi_gt_bbox_iou[overlapped_boxes_ind,
overlapped_boxes_gt_classes] = iou_max[overlapped_boxes_ind]
crowd_ind = np.where(is_crowd)[0]
roi_gt_bbox_iou[crowd_ind] = -1
max_overlap = roi_gt_bbox_iou.max(axis=1)
labels = roi_gt_bbox_iou.argmax(axis=1)
return roi_gt_bbox_inds, labels, max_overlap
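The matching in `label_bbox` boils down to a pairwise IoU matrix followed by a per-RoI argmax/max over the gt boxes. A self-contained sketch of that step (simplified IoU, without any +1 pixel convention the real `bbox_overlaps` may use):

```python
import numpy as np

def iou_matrix(boxes, gt_boxes):
    # Pairwise IoU between [N, 4] RoIs and [M, 4] gt boxes in (x1, y1, x2, y2) form.
    x1 = np.maximum(boxes[:, None, 0], gt_boxes[None, :, 0])
    y1 = np.maximum(boxes[:, None, 1], gt_boxes[None, :, 1])
    x2 = np.minimum(boxes[:, None, 2], gt_boxes[None, :, 2])
    y2 = np.minimum(boxes[:, None, 3], gt_boxes[None, :, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_r = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    area_g = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
    return inter / (area_r[:, None] + area_g[None, :] - inter)

rois = np.array([[0, 0, 10, 10], [20, 20, 30, 30], [100, 100, 110, 110]], dtype=np.float32)
gts = np.array([[0, 0, 10, 10], [19, 19, 31, 31]], dtype=np.float32)
iou = iou_matrix(rois, gts)
gt_inds = iou.argmax(axis=1)     # best-matching gt per RoI
max_overlap = iou.max(axis=1)    # its IoU; 0 means no overlap at all
```

The third RoI overlaps nothing, so its `max_overlap` is 0 and it will later fall into the background range of `sample_bbox`.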


@jit
def sample_bbox(max_overlap,
batch_size_per_im,
fg_fraction,
fg_thresh,
bg_thresh_hi,
bg_thresh_lo,
bbox_reg_weights,
class_nums,
use_random=True,
is_cls_agnostic=False,
is_cascade_rcnn=False):
rois_per_image = int(batch_size_per_im)
fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
if is_cascade_rcnn:
fg_inds = np.where(max_overlap >= fg_thresh)[0]
bg_inds = np.where((max_overlap < bg_thresh_hi) & (max_overlap >=
bg_thresh_lo))[0]
fg_nums = fg_inds.shape[0]
bg_nums = bg_inds.shape[0]
else:
        # sample fg
fg_inds = np.where(max_overlap >= fg_thresh)[0]
fg_nums = np.minimum(fg_rois_per_im, fg_inds.shape[0])
if (fg_inds.shape[0] > fg_nums) and use_random:
fg_inds = np.random.choice(fg_inds, size=fg_nums, replace=False)
fg_inds = fg_inds[:fg_nums]
# sample bg
bg_inds = np.where((max_overlap < bg_thresh_hi) & (max_overlap >=
bg_thresh_lo))[0]
bg_nums = rois_per_image - fg_nums
bg_nums = np.minimum(bg_nums, bg_inds.shape[0])
if (bg_inds.shape[0] > bg_nums) and use_random:
bg_inds = np.random.choice(bg_inds, size=bg_nums, replace=False)
bg_inds = bg_inds[:bg_nums]
return fg_inds, bg_inds, fg_nums
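The foreground/background split above can be traced on a toy example (illustrative values; the thresholds mirror common Faster R-CNN defaults, and the truncation stands in for the `use_random=False` path):

```python
import numpy as np

max_overlap = np.array([0.9, 0.7, 0.55, 0.4, 0.2, 0.05])
fg_thresh, bg_thresh_hi, bg_thresh_lo = 0.5, 0.5, 0.0
batch_size_per_im, fg_fraction = 4, 0.25

fg_rois_per_im = int(np.round(fg_fraction * batch_size_per_im))  # 1 fg slot
fg_inds = np.where(max_overlap >= fg_thresh)[0]                  # candidates 0, 1, 2
fg_nums = min(fg_rois_per_im, fg_inds.shape[0])
fg_inds = fg_inds[:fg_nums]                     # deterministic pick (use_random=False)

bg_inds = np.where((max_overlap < bg_thresh_hi) &
                   (max_overlap >= bg_thresh_lo))[0]
bg_nums = min(batch_size_per_im - fg_nums, bg_inds.shape[0])
bg_inds = bg_inds[:bg_nums]                     # background fills the remaining slots
```

Note that background is capped at `rois_per_image - fg_nums`, so the fg:bg ratio in a batch never exceeds `fg_fraction` even when many high-IoU RoIs exist.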


@jit
def generate_mask_target(im_info, gt_classes, is_crowd, gt_segms, rois,
rois_num, labels_int32, num_classes, resolution):
mask_rois = []
mask_rois_num = []
rois_has_mask_int32 = []
mask_int32 = []
st_num = 0
end_num = 0
for k in range(len(rois_num)):
length = rois_num[k]
end_num += length
# remove padding
gt_polys = gt_segms[k]
new_gt_polys = []
for i in range(gt_polys.shape[0]):
gt_segs = []
for j in range(gt_polys[i].shape[0]):
new_poly = []
polys = gt_polys[i][j]
for ii in range(polys.shape[0]):
x, y = polys[ii]
if (x == -1 and y == -1):
continue
elif (x >= 0 or y >= 0):
                        new_poly.append([x, y])  # keep valid (non-padding) vertex
if len(new_poly) > 0:
gt_segs.append(new_poly)
new_gt_polys.append(gt_segs)
im_scale = im_info[k][2]
boxes = rois[st_num:end_num] / im_scale
bbox_fg, bbox_has_mask, masks = sample_mask(
boxes, new_gt_polys, labels_int32[st_num:end_num], gt_classes[k],
is_crowd[k], num_classes, resolution)
st_num += length
mask_rois.append(bbox_fg * im_scale)
mask_rois_num.append(len(bbox_fg))
rois_has_mask_int32.append(bbox_has_mask)
mask_int32.append(masks)
mask_rois = np.concatenate(mask_rois, axis=0).astype(np.float32)
mask_rois_num = np.array(mask_rois_num).astype(np.int32)
rois_has_mask_int32 = np.concatenate(
rois_has_mask_int32, axis=0).astype(np.int32)
mask_int32 = np.concatenate(mask_int32, axis=0).astype(np.int32)
return mask_rois, mask_rois_num, rois_has_mask_int32, mask_int32
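The nested loops in `generate_mask_target` exist to strip the `(-1, -1)` vertices that `pad_gt` appends to make polygons rectangular. The same filtering for a single polygon, vectorized (a sketch with made-up coordinates):

```python
import numpy as np

# A padded polygon: real vertices followed by (-1, -1) padding rows.
poly = np.array([[0., 0.], [4., 0.], [4., 4.], [-1., -1.], [-1., -1.]])
valid = ~np.all(poly == -1, axis=1)   # True for rows that are not padding
new_poly = poly[valid]                # only the three real vertices remain
```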


@jit
def sample_mask(boxes, gt_polys, label_int32, gt_classes, is_crowd, num_classes,
resolution):
gt_polys_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0]
_gt_polys = [gt_polys[i] for i in gt_polys_inds]
boxes_from_polys = polys_to_boxes(_gt_polys)
fg_inds = np.where(label_int32 > 0)[0]
bbox_has_mask = fg_inds.copy()
if fg_inds.shape[0] > 0:
labels_fg = label_int32[fg_inds]
masks_fg = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32)
bbox_fg = boxes[fg_inds]
iou = bbox_overlaps_mask(bbox_fg, boxes_from_polys)
fg_polys_inds = np.argmax(iou, axis=1)
for i in range(bbox_fg.shape[0]):
poly_gt = _gt_polys[fg_polys_inds[i]]
roi_fg = bbox_fg[i]
mask = polys_to_mask_wrt_box(poly_gt, roi_fg, resolution)
mask = np.array(mask > 0, dtype=np.int32)
masks_fg[i, :] = np.reshape(mask, resolution**2)
else:
bg_inds = np.where(label_int32 == 0)[0]
bbox_fg = boxes[bg_inds[0]].reshape((1, -1))
masks_fg = -np.ones((1, resolution**2), dtype=np.int32)
labels_fg = np.zeros((1, ))
bbox_has_mask = np.append(bbox_has_mask, 0)
masks = expand_mask_targets(masks_fg, labels_fg, resolution, num_classes)
return bbox_fg, bbox_has_mask, masks
@@ -90,11 +90,4 @@ def check_config(cfg):
     if 'log_iter' not in cfg:
         cfg.log_iter = 20
-    logger.debug("The 'num_classes'(number of classes) you set is {}, " \
-                 "and 'with_background' in 'dataset' sets {}.\n" \
-                 "So please note the actual number of categories is {}."
-                 .format(cfg.num_classes, cfg.with_background,
-                         cfg.num_classes + 1))
-    cfg.num_classes = cfg.num_classes + int(cfg.with_background)
     return cfg
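The lines removed here used to fold the background class into `num_classes` during config checking. A minimal sketch of that old behavior (the values are illustrative):

```python
# Old check_config behavior removed by this diff: with a background class,
# the model actually predicts num_classes + 1 categories.
num_classes, with_background = 80, True
total_classes = num_classes + int(with_background)
```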