diff --git a/configs/vitdet/_base_/reader.yml b/configs/vitdet/_base_/faster_rcnn_reader.yml similarity index 100% rename from configs/vitdet/_base_/reader.yml rename to configs/vitdet/_base_/faster_rcnn_reader.yml diff --git a/configs/vitdet/_base_/mask_rcnn_reader.yml b/configs/vitdet/_base_/mask_rcnn_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..83fd376b730ed10767508d3541e778d9663f4555 --- /dev/null +++ b/configs/vitdet/_base_/mask_rcnn_reader.yml @@ -0,0 +1,41 @@ +worker_num: 2 +TrainReader: + sample_transforms: + - Decode: {} + # - RandomResizeCrop: {resizes: [400, 500, 600], cropsizes: [[384, 600], ], prob: 0.5} + - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True} + - RandomFlip: {prob: 0.5} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: true + drop_last: true + collate_batch: false + use_shared_memory: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false + drop_last: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadBatch: {pad_to_stride: 32} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/vitdet/_base_/optimizer_base_30e.yml b/configs/vitdet/_base_/optimizer_base_30e.yml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/configs/vitdet/_base_/ppyoloe_reader.yml b/configs/vitdet/_base_/ppyoloe_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..a4feaff4a1c1d64556bd787bd36b7ec7c6b08d81 --- /dev/null +++ b/configs/vitdet/_base_/ppyoloe_reader.yml @@ -0,0 +1,40 @@ +worker_num: 4 +eval_height: &eval_height 640 +eval_width: &eval_width 640 +eval_size: &eval_size [*eval_height, *eval_width] + +TrainReader: + sample_transforms: + - Decode: {} + - RandomDistort: {} + - RandomExpand: {fill_value: [123.675, 116.28, 103.53]} + - RandomCrop: {} + - RandomFlip: {} + batch_transforms: + - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768], random_size: True, random_interp: True, keep_ratio: False} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + - PadGT: {} + batch_size: 2 + shuffle: true + drop_last: true + use_shared_memory: true + collate_batch: true + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 2 + +TestReader: + inputs_def: + image_shape: [3, *eval_height, *eval_width] + sample_transforms: + - Decode: {} + - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2} + - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none} + - Permute: {} + batch_size: 1 diff --git a/configs/vitdet/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml b/configs/vitdet/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml index c27b4e6c417e4e3597a0733c83307b17bddc28e7..9ffbab297d14439712b105bd1b855315da467714 100644 --- a/configs/vitdet/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml +++ b/configs/vitdet/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml @@ -2,7 +2,7 @@ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', - './_base_/reader.yml', + './_base_/faster_rcnn_reader.yml', './_base_/optimizer_base_1x.yml' ] diff --git a/configs/vitdet/mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml b/configs/vitdet/mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..c11ce890d64a8709ba31df0a93d24494f7e3aa65 --- /dev/null +++ b/configs/vitdet/mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml @@ -0,0 +1,135 @@ +_BASE_: [ + '../datasets/coco_instance.yml', + '../runtime.yml', + './_base_/mask_rcnn_reader.yml', + './_base_/optimizer_base_1x.yml' +] + +weights: output/mask_rcnn_vit_base_hrfpn_cae_1x_coco/model_final + + +# runtime +log_iter: 100 +snapshot_epoch: 1 +norm_type: sync_bn +use_fused_allreduce_gradients: &use_checkpoint False + + +architecture: MaskRCNN +MaskRCNN: + backbone: VisionTransformer + neck: HRFPN + rpn_head: RPNHead + bbox_head: BBoxHead + mask_head: MaskHead + # post process + bbox_post_process: BBoxPostProcess + mask_post_process: MaskPostProcess + +VisionTransformer: + patch_size: 16 + embed_dim: 768 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True + drop_rate: 0.0 + drop_path_rate: 0.2 + init_values: 0.1 + final_norm: False + use_rel_pos_bias: False + use_sincos_pos_emb: True + epsilon: 0.000001 # 1e-6 + out_indices: [3, 5, 7, 11] + with_fpn: True + use_checkpoint: *use_checkpoint + pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams + +HRFPN: + out_channel: 256 + use_bias: True + +RPNHead: + anchor_generator: + aspect_ratios: [0.5, 1.0, 2.0] + anchor_sizes: [[32], [64], [128], [256], [512]] + strides: [4, 8, 16, 32, 64] + rpn_target_assign: + batch_size_per_im: 256 + fg_fraction: 0.5 + negative_overlap: 0.3 + positive_overlap: 0.7 + use_random: True + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 1000 + topk_after_collect: True + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 1000 + post_nms_top_n: 1000 + loss_rpn_bbox: SmoothL1Loss + +SmoothL1Loss: + beta: 0.1111111111111111 + + +BBoxHead: + head: XConvNormHead + roi_extractor: + resolution: 7 + sampling_ratio: 0 + aligned: True + bbox_assigner: BBoxAssigner + loss_normalize_pos: True + bbox_loss: GIoULoss + +BBoxAssigner: + batch_size_per_im: 512 + bg_thresh: 0.5 + fg_thresh: 0.5 + fg_fraction: 0.25 + use_random: True + + +XConvNormHead: + num_convs: 4 + norm_type: bn + +GIoULoss: + loss_weight: 10. + reduction: 'none' + eps: 0.000001 + + + +BBoxPostProcess: + decode: RCNNBox + nms: + name: MultiClassNMS + keep_top_k: 100 + score_threshold: 0.05 + nms_threshold: 0.5 + +MaskHead: + head: MaskFeat + roi_extractor: + resolution: 14 + sampling_ratio: 0 + aligned: True + mask_assigner: MaskAssigner + share_bbox_feat: False + +MaskFeat: + num_convs: 4 + out_channel: 256 + norm_type: ~ + +MaskAssigner: + mask_resolution: 28 + +MaskPostProcess: + binary_thresh: 0.5 diff --git a/configs/vitdet/ppyoloe_vit_base_csppan_cae_30e_coco.yml b/configs/vitdet/ppyoloe_vit_base_csppan_cae_30e_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..d446d5cc9152d9de4a7fb36fb233680008eb6d03 --- /dev/null +++ b/configs/vitdet/ppyoloe_vit_base_csppan_cae_30e_coco.yml @@ -0,0 +1,78 @@ + +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + './_base_/ppyoloe_reader.yml', + './_base_/optimizer_base_30e.yml' +] + +weights: output/ppyoloe_vit_base_csppan_cae_30e_coco/model_final + + +snapshot_epoch: 2 +log_iter: 100 + + +use_ema: true +ema_decay: 0.9999 +ema_skip_names: ['yolo_head.proj_conv.weight', 'backbone.pos_embed'] +custom_black_list: ['reduce_mean'] +use_fused_allreduce_gradients: &use_checkpoint False + + +architecture: YOLOv3 +norm_type: sync_bn + +YOLOv3: + backbone: VisionTransformer + neck: YOLOCSPPAN + yolo_head: PPYOLOEHead + post_process: ~ + +VisionTransformer: + patch_size: 16 + embed_dim: 768 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True + drop_rate: 0.0 + drop_path_rate: 0.2 + init_values: 0.1 + final_norm: False + use_rel_pos_bias: False + use_sincos_pos_emb: True + epsilon: 0.000001 # 1e-6 + out_indices: [11, ] + with_fpn: True + num_fpn_levels: 3 + out_with_norm: False + use_checkpoint: *use_checkpoint + pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams + +YOLOCSPPAN: + in_channels: [768, 768, 768] + act: 'silu' + +PPYOLOEHead: + fpn_strides: [8, 16, 32] + in_channels: [768, 768, 768] + static_assigner_epoch: -1 + grid_cell_scale: 5.0 + grid_cell_offset: 0.5 + use_varifocal_loss: True + loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5} + static_assigner: + name: ATSSAssigner + topk: 9 + assigner: + name: TaskAlignedAssigner + topk: 13 + alpha: 1.0 + beta: 6.0 + nms: + name: MultiClassNMS + nms_top_k: 1000 + keep_top_k: 300 + score_threshold: 0.01 + nms_threshold: 0.7 diff --git a/ppdet/modeling/backbones/vision_transformer.py b/ppdet/modeling/backbones/vision_transformer.py index e566e6a3b9e4fb5e1d526c85cc306bca4c9ed4ee..ef2914fadfe40e4a5246526e4de0bdd1d9f87d27 100644 --- a/ppdet/modeling/backbones/vision_transformer.py +++ b/ppdet/modeling/backbones/vision_transformer.py @@ -340,6 +340,7 @@ class VisionTransformer(nn.Layer): use_abs_pos_emb=False, use_sincos_pos_emb=True, with_fpn=True, + num_fpn_levels=4, use_checkpoint=False, **args): super().__init__() @@ -350,6 +351,8 @@ class VisionTransformer(nn.Layer): self.use_sincos_pos_emb = use_sincos_pos_emb self.use_rel_pos_bias = use_rel_pos_bias self.final_norm = final_norm + self.out_indices = out_indices + self.num_fpn_levels = num_fpn_levels if use_checkpoint: paddle.seed(0) @@ -415,14 +418,15 @@ class VisionTransformer(nn.Layer): assert len(out_indices) <= 4, '' self.out_indices = out_indices - self.out_channels = [embed_dim for _ in range(len(out_indices))] - self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ - 8 for _ in range(len(out_indices)) + self.out_channels = [embed_dim for _ in range(num_fpn_levels)] + self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ + patch_size for _ in range(len(out_indices)) ] self.norm = Identity() if self.with_fpn: + assert num_fpn_levels <= 4, '' self.init_fpn( embed_dim=embed_dim, patch_size=patch_size, ) @@ -611,9 +615,15 @@ class VisionTransformer(nn.Layer): feats.append(xp) if self.with_fpn: - fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - for i in range(len(feats)): - feats[i] = fpns[i](feats[i]) + fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ + -self.num_fpn_levels:] + assert len(fpns) == len(feats) or len(feats) == 1, '' + outputs = [] + for i, m in enumerate(fpns): + outputs.append( + m(feats[i] if len(feats) == len(fpns) else feats[-1])) + + return outputs return feats