Unverified commit 4ac136a4, authored by Wenyu, committed by GitHub

[WIP] Add mask rcnn and yolo in vitdet (#7187)

* fix fpn

* add vitdet mask and yolo

* add vityolo
Parent 995b5067
# R-CNN reader (presumably the _base_ faster_rcnn_reader.yml / mask_rcnn_reader.yml referenced by the configs below)
worker_num: 2

TrainReader:
  sample_transforms:
  - Decode: {}
  # - RandomResizeCrop: {resizes: [400, 500, 600], cropsizes: [[384, 600], ], prob: 0.5}
  - RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
  - RandomFlip: {prob: 0.5}
  - NormalizeImage: {is_scale: true, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
  - Permute: {}
  batch_transforms:
  - PadBatch: {pad_to_stride: 32}  # see padding sketch below
  batch_size: 1
  shuffle: true
  drop_last: true
  collate_batch: false
  use_shared_memory: true

EvalReader:
  sample_transforms:
  - Decode: {}
  - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True}
  - NormalizeImage: {is_scale: true, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
  - Permute: {}
  batch_transforms:
  - PadBatch: {pad_to_stride: 32}
  batch_size: 1
  shuffle: false
  drop_last: false

TestReader:
  sample_transforms:
  - Decode: {}
  - Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True}
  - NormalizeImage: {is_scale: true, mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225]}
  - Permute: {}
  batch_transforms:
  - PadBatch: {pad_to_stride: 32}
  batch_size: 1
  shuffle: false
  drop_last: false
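PadBatch with pad_to_stride: 32 zero-pads every image in a batch so height and width become multiples of 32, which keeps every FPN stride used here (up to 32) dividing the input evenly. A minimal sketch of that rounding, assuming CHW input after Permute (numpy only; pad_to_stride below is an illustrative helper, not PaddleDetection's API):

import math
import numpy as np

def pad_to_stride(img, stride=32):
    # Zero-pad a CHW image so height and width are multiples of `stride`.
    c, h, w = img.shape
    ph = int(math.ceil(h / stride)) * stride
    pw = int(math.ceil(w / stride)) * stride
    out = np.zeros((c, ph, pw), dtype=img.dtype)
    out[:, :h, :w] = img
    return out

For example, pad_to_stride on an 800x1333 array returns 800x1344: 800 is already a multiple of 32, while 1333 rounds up to 42 * 32.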
# PP-YOLOE reader (presumably the _base_ ppyoloe_reader.yml referenced below)
worker_num: 4
eval_height: &eval_height 640
eval_width: &eval_width 640
eval_size: &eval_size [*eval_height, *eval_width]

TrainReader:
  sample_transforms:
  - Decode: {}
  - RandomDistort: {}
  - RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
  - RandomCrop: {}
  - RandomFlip: {}
  batch_transforms:
  - BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768], random_size: True, random_interp: True, keep_ratio: False}  # one size per batch; see sketch below
  - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
  - Permute: {}
  - PadGT: {}
  batch_size: 2
  shuffle: true
  drop_last: true
  use_shared_memory: true
  collate_batch: true

EvalReader:
  sample_transforms:
  - Decode: {}
  - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
  - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
  - Permute: {}
  batch_size: 2

TestReader:
  inputs_def:
    image_shape: [3, *eval_height, *eval_width]
  sample_transforms:
  - Decode: {}
  - Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
  - NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
  - Permute: {}
  batch_size: 1
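Unlike the per-sample RandomResize in the R-CNN reader, BatchRandomResize with random_size: True draws one target size per batch, so all images in a batch share a shape and no PadBatch step is needed; keep_ratio: False stretches each image to exactly size x size. A rough sketch of that selection, assuming OpenCV for resizing (batch_random_resize is an illustrative helper, not the library transform):

import random
import cv2

SIZES = [320, 352, 384, 416, 448, 480, 512, 544,
         576, 608, 640, 672, 704, 736, 768]
INTERPS = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA,
           cv2.INTER_CUBIC, cv2.INTER_LANCZOS4]

def batch_random_resize(images):
    # One size and one interpolation shared by the whole batch
    # (random_size: True, random_interp: True).
    size = random.choice(SIZES)
    interp = random.choice(INTERPS)
    # keep_ratio: False -> stretch to exactly (size, size).
    return [cv2.resize(im, (size, size), interpolation=interp) for im in images]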
@@ -2,7 +2,7 @@
 _BASE_: [
   '../datasets/coco_detection.yml',
   '../runtime.yml',
-  './_base_/reader.yml',
+  './_base_/faster_rcnn_reader.yml',
   './_base_/optimizer_base_1x.yml'
 ]
...
# Mask R-CNN + ViT config (the weights path suggests mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml)
_BASE_: [
  '../datasets/coco_instance.yml',
  '../runtime.yml',
  './_base_/mask_rcnn_reader.yml',
  './_base_/optimizer_base_1x.yml'
]
weights: output/mask_rcnn_vit_base_hrfpn_cae_1x_coco/model_final

# runtime
log_iter: 100
snapshot_epoch: 1

norm_type: sync_bn
use_fused_allreduce_gradients: &use_checkpoint False

architecture: MaskRCNN

MaskRCNN:
  backbone: VisionTransformer
  neck: HRFPN
  rpn_head: RPNHead
  bbox_head: BBoxHead
  mask_head: MaskHead
  # post process
  bbox_post_process: BBoxPostProcess
  mask_post_process: MaskPostProcess

VisionTransformer:
  patch_size: 16
  embed_dim: 768
  depth: 12
  num_heads: 12
  mlp_ratio: 4
  qkv_bias: True
  drop_rate: 0.0
  drop_path_rate: 0.2
  init_values: 0.1
  final_norm: False
  use_rel_pos_bias: False
  use_sincos_pos_emb: True
  epsilon: 0.000001 # 1e-6
  out_indices: [3, 5, 7, 11]
  with_fpn: True
  use_checkpoint: *use_checkpoint
  pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams

HRFPN:
  out_channel: 256
  use_bias: True

RPNHead:
  anchor_generator:
    aspect_ratios: [0.5, 1.0, 2.0]
    anchor_sizes: [[32], [64], [128], [256], [512]]
    strides: [4, 8, 16, 32, 64]
  rpn_target_assign:
    batch_size_per_im: 256
    fg_fraction: 0.5
    negative_overlap: 0.3
    positive_overlap: 0.7
    use_random: True
  train_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 2000
    post_nms_top_n: 1000
    topk_after_collect: True
  test_proposal:
    min_size: 0.0
    nms_thresh: 0.7
    pre_nms_top_n: 1000
    post_nms_top_n: 1000
  loss_rpn_bbox: SmoothL1Loss

SmoothL1Loss:
  beta: 0.1111111111111111 # 1/9; see note below

BBoxHead:
  head: XConvNormHead
  roi_extractor:
    resolution: 7
    sampling_ratio: 0
    aligned: True
  bbox_assigner: BBoxAssigner
  loss_normalize_pos: True
  bbox_loss: GIoULoss

BBoxAssigner:
  batch_size_per_im: 512
  bg_thresh: 0.5
  fg_thresh: 0.5
  fg_fraction: 0.25
  use_random: True

XConvNormHead:
  num_convs: 4
  norm_type: bn

GIoULoss:
  loss_weight: 10.
  reduction: 'none'
  eps: 0.000001

BBoxPostProcess:
  decode: RCNNBox
  nms:
    name: MultiClassNMS
    keep_top_k: 100
    score_threshold: 0.05
    nms_threshold: 0.5

MaskHead:
  head: MaskFeat
  roi_extractor:
    resolution: 14
    sampling_ratio: 0
    aligned: True
  mask_assigner: MaskAssigner
  share_bbox_feat: False

MaskFeat:
  num_convs: 4
  out_channel: 256
  norm_type: ~

MaskAssigner:
  mask_resolution: 28

MaskPostProcess:
  binary_thresh: 0.5
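The odd-looking beta: 0.1111111111111111 above is just 1/9, the Smooth L1 transition point Detectron-style RPNs commonly use. For reference, the standard definition (a plain restatement of the textbook formula, not PaddleDetection's implementation):

import numpy as np

def smooth_l1(diff, beta=1.0 / 9.0):
    # Quadratic for |diff| < beta, linear beyond; the two pieces
    # meet continuously at |diff| == beta.
    ad = np.abs(diff)
    return np.where(ad < beta, 0.5 * ad * ad / beta, ad - 0.5 * beta)

A small beta tightens the quadratic zone, so most regression residuals fall in the linear, outlier-robust regime.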
# PP-YOLOE + ViT config (the weights path suggests ppyoloe_vit_base_csppan_cae_30e_coco.yml)
_BASE_: [
  '../datasets/coco_detection.yml',
  '../runtime.yml',
  './_base_/ppyoloe_reader.yml',
  './_base_/optimizer_base_30e.yml'
]
weights: output/ppyoloe_vit_base_csppan_cae_30e_coco/model_final

snapshot_epoch: 2
log_iter: 100

use_ema: true
ema_decay: 0.9999
ema_skip_names: ['yolo_head.proj_conv.weight', 'backbone.pos_embed']
custom_black_list: ['reduce_mean']
use_fused_allreduce_gradients: &use_checkpoint False

architecture: YOLOv3
norm_type: sync_bn

YOLOv3:
  backbone: VisionTransformer
  neck: YOLOCSPPAN
  yolo_head: PPYOLOEHead
  post_process: ~

VisionTransformer:
  patch_size: 16
  embed_dim: 768
  depth: 12
  num_heads: 12
  mlp_ratio: 4
  qkv_bias: True
  drop_rate: 0.0
  drop_path_rate: 0.2
  init_values: 0.1
  final_norm: False
  use_rel_pos_bias: False
  use_sincos_pos_emb: True
  epsilon: 0.000001 # 1e-6
  out_indices: [11, ]
  with_fpn: True
  num_fpn_levels: 3
  out_with_norm: False
  use_checkpoint: *use_checkpoint
  pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams

YOLOCSPPAN:
  in_channels: [768, 768, 768]
  act: 'silu'

PPYOLOEHead:
  fpn_strides: [8, 16, 32]
  in_channels: [768, 768, 768]
  static_assigner_epoch: -1
  grid_cell_scale: 5.0
  grid_cell_offset: 0.5
  use_varifocal_loss: True
  loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5}
  static_assigner:
    name: ATSSAssigner
    topk: 9
  assigner:
    name: TaskAlignedAssigner  # TOOD-style; see note below
    topk: 13
    alpha: 1.0
    beta: 6.0
  nms:
    name: MultiClassNMS
    nms_top_k: 1000
    keep_top_k: 300
    score_threshold: 0.01
    nms_threshold: 0.7
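TaskAlignedAssigner follows TOOD: each anchor is scored by the alignment metric t = s^alpha * u^beta, where s is the classification score and u is the IoU with the ground-truth box, and the topk: 13 anchors per GT become positives. A quick illustration of what alpha: 1.0 and beta: 6.0 imply (standalone arithmetic, not the library code):

def alignment_metric(cls_score, iou, alpha=1.0, beta=6.0):
    # TOOD alignment metric t = s**alpha * u**beta.
    return (cls_score ** alpha) * (iou ** beta)

# beta = 6.0 heavily rewards localization quality:
# alignment_metric(0.9, 0.5) ~= 0.014, while alignment_metric(0.5, 0.9) ~= 0.266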
@@ -340,6 +340,7 @@ class VisionTransformer(nn.Layer):
                  use_abs_pos_emb=False,
                  use_sincos_pos_emb=True,
                  with_fpn=True,
+                 num_fpn_levels=4,
                  use_checkpoint=False,
                  **args):
         super().__init__()
@@ -350,6 +351,8 @@ class VisionTransformer(nn.Layer):
         self.use_sincos_pos_emb = use_sincos_pos_emb
         self.use_rel_pos_bias = use_rel_pos_bias
         self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
         if use_checkpoint:
             paddle.seed(0)
@@ -415,14 +418,15 @@ class VisionTransformer(nn.Layer):
         assert len(out_indices) <= 4, ''
         self.out_indices = out_indices
-        self.out_channels = [embed_dim for _ in range(len(out_indices))]
-        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
-            8 for _ in range(len(out_indices))
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
         ]
         self.norm = Identity()
         if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
             self.init_fpn(
                 embed_dim=embed_dim,
                 patch_size=patch_size, )
@@ -611,9 +615,15 @@ class VisionTransformer(nn.Layer):
             feats.append(xp)
         if self.with_fpn:
-            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
-            for i in range(len(feats)):
-                feats[i] = fpns[i](feats[i])
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
+                -self.num_fpn_levels:]
+            assert len(fpns) == len(feats) or len(feats) == 1, ''
+            outputs = []
+            for i, m in enumerate(fpns):
+                outputs.append(
+                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))
+            return outputs
         return feats
...
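The net effect of this change: when the backbone emits a single feature map (the PP-YOLOE config's out_indices: [11] with num_fpn_levels: 3), that one stride-16 feature is routed through the last num_fpn_levels FPN branches to synthesize a pyramid, while with four out_indices the old one-branch-per-feature behavior is preserved. A toy trace of the new routing (the fpn names stand in for the real up-/down-sampling branches):

feats = ['vit_stride16']                  # out_indices: [11] -> one feature
num_fpn_levels = 3

fpns = ['fpn1_up4x', 'fpn2_up2x', 'fpn3_identity', 'fpn4_down2x'][-num_fpn_levels:]
assert len(fpns) == len(feats) or len(feats) == 1

outputs = [(m, feats[i] if len(feats) == len(fpns) else feats[-1])
           for i, m in enumerate(fpns)]
# -> fpn2/fpn3/fpn4 all consume the stride-16 feature, yielding strides
#    [8, 16, 32], matching PPYOLOEHead.fpn_strides and YOLOCSPPAN's three inputs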