diff --git a/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..6b805b7c74305338613191b28f9333d7aba39de0 --- /dev/null +++ b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml @@ -0,0 +1,110 @@ + +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/reader.yml', + './_base_/optimizer_base_1x.yml' +] + +weights: output/faster_rcnn_vit_base_fpn_cae_1x_coco/model_final + + +# runtime +log_iter: 100 +snapshot_epoch: 1 +find_unused_parameters: True + +use_gpu: true +norm_type: sync_bn + +OptimizerBuilder: + optimizer: + weight_decay: 0.05 + +# reader +worker_num: 2 +TrainReader: + batch_size: 1 + + +# model +architecture: FasterRCNN + +FasterRCNN: + backbone: VisionTransformer + neck: FPN + rpn_head: RPNHead + bbox_head: BBoxHead + bbox_post_process: BBoxPostProcess + +VisionTransformer: + patch_size: 16 + embed_dim: 768 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True + drop_rate: 0.0 + drop_path_rate: 0.2 + init_values: 0.1 + final_norm: False + use_rel_pos_bias: False + use_sincos_pos_emb: True + epsilon: 0.000001 # 1e-6 + out_indices: [3, 5, 7, 11] + with_fpn: True + pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams + + +FPN: + out_channel: 256 + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 1000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + + +BBoxHead: + head: TwoFCHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + +TwoFCHead: + out_channel: 1024 + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 diff --git a/ppdet/modeling/heads/bbox_head.py b/ppdet/modeling/heads/bbox_head.py index debd3074c2ad0ae05a26c9ef240d9b4a573846e6..e0041be371a23e31842165cd5e5a0e4d95265c8c 100644 --- a/ppdet/modeling/heads/bbox_head.py +++ b/ppdet/modeling/heads/bbox_head.py @@ -184,7 +184,8 @@ class BBoxHead(nn.Layer): with_pool=False, num_classes=80, bbox_weight=[10., 10., 5., 5.], - bbox_loss=None): + bbox_loss=None, + loss_normalize_pos=False): super(BBoxHead, self).__init__() self.head = head self.roi_extractor = roi_extractor @@ -196,6 +197,7 @@ class BBoxHead(nn.Layer): self.num_classes = num_classes self.bbox_weight = bbox_weight self.bbox_loss = bbox_loss + self.loss_normalize_pos = loss_normalize_pos self.bbox_score = nn.Linear( in_channel, @@ -250,8 +252,13 @@ class BBoxHead(nn.Layer): deltas = self.bbox_delta(feat) if self.training: - loss = self.get_loss(scores, deltas, targets, rois, - self.bbox_weight) + loss = self.get_loss( + scores, + deltas, + targets, + rois, + self.bbox_weight, + loss_normalize_pos=self.loss_normalize_pos) return loss, bbox_feat else: pred = self.get_prediction(scores, deltas) diff --git a/ppdet/modeling/heads/cascade_head.py b/ppdet/modeling/heads/cascade_head.py index 0498a35da5ce4952739245ba0426a1ac306bf2e3..9efc6492b6afe450f08e794dbe3f9c22d621cea7 100644 --- a/ppdet/modeling/heads/cascade_head.py +++ b/ppdet/modeling/heads/cascade_head.py @@ -250,7 +250,7 @@ class CascadeHead(BBoxHead): if self.training: deltas = deltas[paddle.arange(deltas.shape[0]), labels] else: - deltas = deltas[(deltas * F.one_hot( + deltas = deltas[((deltas + 10000) * F.one_hot( labels, num_classes=self.num_classes).unsqueeze(-1) != 0 ).nonzero(as_tuple=True)].reshape( [deltas.shape[0], 4])