diff --git a/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml
new file mode 100644
index 0000000000000000000000000000000000000000..6b805b7c74305338613191b28f9333d7aba39de0
--- /dev/null
+++ b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml
@@ -0,0 +1,110 @@
+
+_BASE_: [  # base config files, referenced by relative path
+  '../datasets/coco_detection.yml',
+  '../runtime.yml',
+  './_base_/reader.yml',
+  './_base_/optimizer_base_1x.yml'
+]
+
+weights: output/faster_rcnn_vit_base_fpn_cae_1x_coco/model_final  # directory name mirrors this config's file name
+
+
+# runtime
+log_iter: 100
+snapshot_epoch: 1
+find_unused_parameters: True  # NOTE(review): presumably forwarded to the data-parallel wrapper - confirm
+
+use_gpu: true  # lowercase here, 'True' elsewhere in this file; both are YAML 1.1 booleans
+norm_type: sync_bn
+
+OptimizerBuilder:
+  optimizer:
+    weight_decay: 0.05  # only key set here; NOTE(review): other optimizer settings presumably come from optimizer_base_1x.yml - confirm
+
+# reader
+worker_num: 2
+TrainReader:
+  batch_size: 1  # NOTE(review): presumably the per-device batch size - confirm
+
+
+# model
+architecture: FasterRCNN  # names the FasterRCNN section directly below
+
+FasterRCNN:  # each value names a component section configured further down in this file
+  backbone: VisionTransformer
+  neck: FPN
+  rpn_head: RPNHead
+  bbox_head: BBoxHead
+  bbox_post_process: BBoxPostProcess
+
+VisionTransformer:
+  patch_size: 16
+  embed_dim: 768
+  depth: 12
+  num_heads: 12
+  mlp_ratio: 4
+  qkv_bias: True
+  drop_rate: 0.0
+  drop_path_rate: 0.2
+  init_values: 0.1
+  final_norm: False
+  use_rel_pos_bias: False
+  use_sincos_pos_emb: True
+  epsilon: 0.000001 # 1e-6
+  out_indices: [3, 5, 7, 11]  # four entries, all below depth (12); NOTE(review): presumably zero-based block indices - confirm
+  with_fpn: True
+  pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
+
+
+FPN:
+  out_channel: 256
+
+RPNHead:
+  anchor_generator:
+    aspect_ratios: [0.5, 1.0, 2.0]
+    anchor_sizes: [[32], [64], [128], [256], [512]]  # one size per level; five levels, matching the five strides below
+    strides: [4, 8, 16, 32, 64]
+  rpn_target_assign:
+    batch_size_per_im: 256
+    fg_fraction: 0.5
+    negative_overlap: 0.3
+    positive_overlap: 0.7
+    use_random: True
+  train_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 2000  # twice the value used by test_proposal (1000)
+    post_nms_top_n: 1000
+    topk_after_collect: True  # set only for train_proposal; test_proposal omits this key
+  test_proposal:
+    min_size: 0.0
+    nms_thresh: 0.7
+    pre_nms_top_n: 1000
+    post_nms_top_n: 1000
+
+
+BBoxHead:
+  head: TwoFCHead  # configured in the TwoFCHead section below
+  roi_extractor:
+    resolution: 7
+    sampling_ratio: 0
+    aligned: True
+  bbox_assigner: BBoxAssigner  # configured in the BBoxAssigner section below
+
+BBoxAssigner:
+  batch_size_per_im: 512
+  bg_thresh: 0.5  # same value as fg_thresh
+  fg_thresh: 0.5
+  fg_fraction: 0.25
+  use_random: True
+
+TwoFCHead:
+  out_channel: 1024
+
+BBoxPostProcess:
+  decode: RCNNBox  # no RCNNBox section in this file; NOTE(review): presumably uses the class defaults - confirm
+  nms:
+    name: MultiClassNMS
+    keep_top_k: 100
+    score_threshold: 0.05
+    nms_threshold: 0.5
diff --git a/ppdet/modeling/heads/bbox_head.py b/ppdet/modeling/heads/bbox_head.py
index debd3074c2ad0ae05a26c9ef240d9b4a573846e6..e0041be371a23e31842165cd5e5a0e4d95265c8c 100644
--- a/ppdet/modeling/heads/bbox_head.py
+++ b/ppdet/modeling/heads/bbox_head.py
@@ -184,7 +184,8 @@ class BBoxHead(nn.Layer):
                  with_pool=False,
                  num_classes=80,
                  bbox_weight=[10., 10., 5., 5.],
-                 bbox_loss=None):
+                 bbox_loss=None,
+                 loss_normalize_pos=False):
         super(BBoxHead, self).__init__()
         self.head = head
         self.roi_extractor = roi_extractor
@@ -196,6 +197,7 @@ class BBoxHead(nn.Layer):
         self.num_classes = num_classes
         self.bbox_weight = bbox_weight
         self.bbox_loss = bbox_loss
+        self.loss_normalize_pos = loss_normalize_pos
 
         self.bbox_score = nn.Linear(
             in_channel,
@@ -250,8 +252,13 @@ class BBoxHead(nn.Layer):
         deltas = self.bbox_delta(feat)
 
         if self.training:
-            loss = self.get_loss(scores, deltas, targets, rois,
-                                 self.bbox_weight)
+            loss = self.get_loss(
+                scores,
+                deltas,
+                targets,
+                rois,
+                self.bbox_weight,
+                loss_normalize_pos=self.loss_normalize_pos)
             return loss, bbox_feat
         else:
             pred = self.get_prediction(scores, deltas)
diff --git a/ppdet/modeling/heads/cascade_head.py b/ppdet/modeling/heads/cascade_head.py
index 0498a35da5ce4952739245ba0426a1ac306bf2e3..9efc6492b6afe450f08e794dbe3f9c22d621cea7 100644
--- a/ppdet/modeling/heads/cascade_head.py
+++ b/ppdet/modeling/heads/cascade_head.py
@@ -250,7 +250,7 @@ class CascadeHead(BBoxHead):
                 if self.training:
                     deltas = deltas[paddle.arange(deltas.shape[0]), labels]
                 else:
-                    deltas = deltas[(deltas * F.one_hot(
+                    deltas = deltas[((deltas + 10000) * F.one_hot(
                         labels, num_classes=self.num_classes).unsqueeze(-1) != 0
                                      ).nonzero(as_tuple=True)].reshape(
                                          [deltas.shape[0], 4])