diff --git a/configs/vitdet/README.md b/configs/vitdet/README.md index f42e2473202c02e0798ec8286bbee210a9ab1373..e0037858544b671de34c79f32f43baa9525d9db4 100644 --- a/configs/vitdet/README.md +++ b/configs/vitdet/README.md @@ -13,11 +13,14 @@ non-trivial when new architectures, such as Vision Transformer (ViT) models, arr ## Model Zoo -| Backbone | Pretrained | Model | Scheduler | Images/GPU | Box AP | Config | Download | -|:------:|:--------:|:--------------:|:--------------:|:--------------:|:------:|:------:|:--------:| -| ViT-base | CAE | Cascade RCNN | 1x | 1 | 52.7 | [config](./cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.pdparams) | -| ViT-large | CAE | Cascade RCNN | 1x | 1 | 55.7 | [config](./cascade_rcnn_vit_large_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_large_hrfpn_cae_1x_coco.pdparams) | -| ViT-base | CAE | PP-YOLOE | 36e | 2 | 52.2 | [config](./ppyoloe_vit_base_csppan_cae_36e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_vit_base_csppan_cae_36e_coco.pdparams) | +| Model | Backbone | Pretrained | Scheduler | Images/GPU | Box AP | Mask AP | Config | Download | +|:------:|:--------:|:--------------:|:--------------:|:--------------:|:--------------:|:------:|:------:|:--------:| +| Cascade RCNN | ViT-base | CAE | 1x | 1 | 52.7 | - | [config](./cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.pdparams) | +| Cascade RCNN | ViT-large | CAE | 1x | 1 | 55.7 | - | [config](./cascade_rcnn_vit_large_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_large_hrfpn_cae_1x_coco.pdparams) | +| PP-YOLOE | ViT-base | CAE | 36e | 2 | 52.2 | - | [config](./ppyoloe_vit_base_csppan_cae_36e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_vit_base_csppan_cae_36e_coco.pdparams) | +| Mask RCNN | ViT-base | CAE | 1x | 1 | 50.6 | 44.9 | [config](./mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/mask_rcnn_vit_base_hrfpn_cae_1x_coco.pdparams) | +| Mask RCNN | ViT-large | CAE | 1x | 1 | 54.2 | 47.4 | [config](./mask_rcnn_vit_large_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/mask_rcnn_vit_large_hrfpn_cae_1x_coco.pdparams) | + **Notes:** - Model is trained on COCO train2017 dataset and evaluated on val2017 results of `mAP(IoU=0.5:0.95) diff --git a/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml index 6b805b7c74305338613191b28f9333d7aba39de0..8b693f687fd370231e2bdee47a8e7c719c4d63f2 100644 --- a/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml +++ b/configs/vitdet/faster_rcnn_vit_base_fpn_cae_1x_coco.yml @@ -2,7 +2,7 @@ _BASE_: [ '../datasets/coco_detection.yml', '../runtime.yml', - './_base_/reader.yml', + './_base_/faster_rcnn_reader.yml', './_base_/optimizer_base_1x.yml' ] @@ -81,15 +81,30 @@ RPNHead: nms_thresh: 0.7 pre_nms_top_n: 1000 post_nms_top_n: 1000 + loss_rpn_bbox: SmoothL1Loss + + +SmoothL1Loss: + beta: 0.1111111111111111 BBoxHead: - head: TwoFCHead + # head: TwoFCHead + head: XConvNormHead roi_extractor: resolution: 7 sampling_ratio: 0 aligned: True bbox_assigner: BBoxAssigner + loss_normalize_pos: True + bbox_loss: GIoULoss + + +GIoULoss: + loss_weight: 10. + reduction: 'none' + eps: 0.000001 # 1e-6 + BBoxAssigner: batch_size_per_im: 512 @@ -98,8 +113,13 @@ BBoxAssigner: fg_fraction: 0.25 use_random: True -TwoFCHead: - out_channel: 1024 +# TwoFCHead: +# out_channel: 1024 + +XConvNormHead: + num_convs: 4 + norm_type: bn + BBoxPostProcess: decode: RCNNBox diff --git a/configs/vitdet/mask_rcnn_vit_large_hrfpn_cae_1x_coco.yml b/configs/vitdet/mask_rcnn_vit_large_hrfpn_cae_1x_coco.yml new file mode 100644 index 0000000000000000000000000000000000000000..5884e91e9d146e6ec031e23b6840026e3c39b073 --- /dev/null +++ b/configs/vitdet/mask_rcnn_vit_large_hrfpn_cae_1x_coco.yml @@ -0,0 +1,29 @@ +_BASE_: [ + './mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml' +] + +weights: output/mask_rcnn_vit_large_hrfpn_cae_1x_coco/model_final + + +depth: &depth 24 +dim: &dim 1024 +use_fused_allreduce_gradients: &use_checkpoint True + +VisionTransformer: + img_size: [800, 1344] + embed_dim: *dim + depth: *depth + num_heads: 16 + drop_path_rate: 0.25 + out_indices: [7, 11, 15, 23] + use_checkpoint: *use_checkpoint + pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_large_cae_pretrained.pdparams + +HRFPN: + in_channels: [*dim, *dim, *dim, *dim] + +OptimizerBuilder: + optimizer: + layer_decay: 0.9 + weight_decay: 0.02 + num_layers: *depth diff --git a/ppdet/modeling/backbones/vision_transformer.py b/ppdet/modeling/backbones/vision_transformer.py index ef2914fadfe40e4a5246526e4de0bdd1d9f87d27..825724fa4b58319550a4f3e54c9a0d7d73183d3c 100644 --- a/ppdet/modeling/backbones/vision_transformer.py +++ b/ppdet/modeling/backbones/vision_transformer.py @@ -509,16 +509,24 @@ class VisionTransformer(nn.Layer): dim = x.shape[-1] # we add a small number to avoid floating point error in the interpolation # see discussion at https://github.com/facebookresearch/dino/issues/8 - w0, h0 = w0 + 0.1, h0 + 0.1 + # w0, h0 = w0 + 0.1, h0 + 0.1 + # patch_pos_embed = nn.functional.interpolate( + # patch_pos_embed.reshape([ + # 1, self.patch_embed.num_patches_w, + # self.patch_embed.num_patches_h, dim + # ]).transpose((0, 3, 1, 2)), + # scale_factor=(w0 / self.patch_embed.num_patches_w, + # h0 / self.patch_embed.num_patches_h), + # mode='bicubic', ) patch_pos_embed = nn.functional.interpolate( patch_pos_embed.reshape([ 1, self.patch_embed.num_patches_w, self.patch_embed.num_patches_h, dim ]).transpose((0, 3, 1, 2)), - scale_factor=(w0 / self.patch_embed.num_patches_w, - h0 / self.patch_embed.num_patches_h), + (w0, h0), mode='bicubic', ) + assert int(w0) == patch_pos_embed.shape[-2] and int( h0) == patch_pos_embed.shape[-1] patch_pos_embed = patch_pos_embed.transpose(