# PETR keypoint configuration (ResNet-50 backbone, COCO person keypoints).
#
# Layout: runtime settings, model definition, optimizer, datasets, readers.
# Anchors (&name) defined in the runtime section are reused further down
# through aliases (*name).

##### runtime
use_gpu: true
log_iter: 50
save_dir: output
snapshot_epoch: 1
weights: output/petr_resnet50_16x2_coco/model_final
epoch: 100
# Aliased by the Train/Eval dataset definitions below.
num_joints: &num_joints 17
# NOTE(review): the pixel_std and trainsize anchors are not aliased anywhere
# in the visible part of this file.
pixel_std: &pixel_std 200
metric: COCO
num_classes: 1
trainsize: &trainsize 512
# Index permutation passed to KeyPointFlip (flip_permutation) in TrainReader.
flip_perm: &flip_perm [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15]
find_unused_parameters: false

##### model
architecture: PETR
pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/pretrained/PETR_pretrained.pdparams

PETR:
  backbone:
    name: ResNet
    depth: 50
    variant: b
    norm_type: bn
    freeze_norm: true
    freeze_at: 0
    return_idx: [1, 2, 3]
    num_stages: 4
    lr_mult_list: [0.1, 0.1, 0.1, 0.1]
  neck:
    name: ChannelMapper
    in_channels: [512, 1024, 2048]
    kernel_size: 1
    out_channels: 256
    norm_type: "gn"
    norm_groups: 32
    # NOTE(review): a plain `None` is loaded by YAML as the string "None",
    # not as null. Left unchanged; confirm what the consumer expects.
    act: None
    num_outs: 4
  bbox_head:
    name: PETRHead
    num_query: 300
    num_classes: 1  # only person
    in_channels: 2048
    sync_cls_avg_factor: true
    with_kpt_refine: true
    transformer:
      name: PETRTransformer
      as_two_stage: true
      encoder:
        name: TransformerEncoder
        encoder_layer:
          name: TransformerEncoderLayer
          d_model: 256
          attn:
            name: MSDeformableAttention
            embed_dim: 256
            num_heads: 8
            num_levels: 4
            num_points: 4
          dim_feedforward: 1024
          dropout: 0.1
        num_layers: 6
      decoder:
        name: PETR_TransformerDecoder
        num_layers: 3
        return_intermediate: true
        decoder_layer:
          name: PETR_TransformerDecoderLayer
          d_model: 256
          dim_feedforward: 1024
          dropout: 0.1
          self_attn:
            name: MultiHeadAttention
            embed_dim: 256
            num_heads: 8
            dropout: 0.1
          cross_attn:
            name: MultiScaleDeformablePoseAttention
            embed_dims: 256
            num_heads: 8
            num_levels: 4
            num_points: 17
      hm_encoder:
        name: TransformerEncoder
        encoder_layer:
          name: TransformerEncoderLayer
          d_model: 256
          attn:
            name: MSDeformableAttention
            embed_dim: 256
            num_heads: 8
            num_levels: 1
            num_points: 4
          dim_feedforward: 1024
          dropout: 0.1
        num_layers: 1
      refine_decoder:
        name: PETR_DeformableDetrTransformerDecoder
        num_layers: 2
        return_intermediate: true
        decoder_layer:
          name: PETR_TransformerDecoderLayer
          d_model: 256
          dim_feedforward: 1024
          dropout: 0.1
          self_attn:
            name: MultiHeadAttention
            embed_dim: 256
            num_heads: 8
            dropout: 0.1
          cross_attn:
            name: MSDeformableAttention
            embed_dim: 256
            num_levels: 4
    positional_encoding:
      name: PositionEmbedding
      num_pos_feats: 128
      normalize: true
      offset: -0.5
    loss_cls:
      name: Weighted_FocalLoss
      use_sigmoid: true
      gamma: 2.0
      alpha: 0.25
      loss_weight: 2.0
      reduction: "mean"
    loss_kpt:
      name: L1Loss
      loss_weight: 70.0
    loss_kpt_rpn:
      name: L1Loss
      loss_weight: 70.0
    loss_oks:
      name: OKSLoss
      loss_weight: 2.0
    loss_hm:
      name: CenterFocalLoss
      loss_weight: 4.0
    loss_kpt_refine:
      name: L1Loss
      loss_weight: 80.0
    loss_oks_refine:
      name: OKSLoss
      loss_weight: 3.0
    assigner:
      name: PoseHungarianAssigner
      cls_cost:
        name: FocalLossCost
        weight: 2.0
      kpt_cost:
        name: KptL1Cost
        weight: 70.0
      oks_cost:
        name: OksCost
        weight: 7.0

##### optimizer
LearningRate:
  base_lr: 0.0002
  schedulers:
    - !PiecewiseDecay
      milestones: [80]
      gamma: 0.1
      use_warmup: false
    # Disabled alternative scheduler entry, kept for reference:
    # - !LinearWarmup
    #   start_factor: 0.001
    #   steps: 1000

OptimizerBuilder:
  clip_grad_by_norm: 0.1
  optimizer:
    type: AdamW
  regularizer:
    factor: 0.0001
    type: L2

##### data
TrainDataset: !KeypointBottomUpCocoDataset
  image_dir: train2017
  anno_path: annotations/person_keypoints_train2017.json
  dataset_dir: dataset/coco
  num_joints: *num_joints
  return_mask: false

EvalDataset: !KeypointBottomUpCocoDataset
  image_dir: val2017
  anno_path: annotations/person_keypoints_val2017.json
  dataset_dir: dataset/coco
  num_joints: *num_joints
  test_mode: true
  return_mask: false

TestDataset: !ImageFolder
  anno_path: dataset/coco/keypoint_imagelist.txt

worker_num: 2
# Normalisation statistics shared by every reader's NormalizeImage step.
global_mean: &global_mean [0.485, 0.456, 0.406]
global_std: &global_std [0.229, 0.224, 0.225]

TrainReader:
  sample_transforms:
    - Decode: {}
    - PhotoMetricDistortion:
        brightness_delta: 32
        contrast_range: [0.5, 1.5]
        saturation_range: [0.5, 1.5]
        hue_delta: 18
    - KeyPointFlip:
        flip_prob: 0.5
        flip_permutation: *flip_perm
    - RandomAffine:
        max_degree: 30
        scale: [1.0, 1.0]
        max_shift: 0.0
        trainsize: -1
    # Two alternative transform pipelines; each entry is a single-key mapping
    # of transform name to its arguments.
    - RandomSelect:
        transforms1:
          - RandomShortSideRangeResize:
              scales: [[400, 1400], [1400, 1400]]
        transforms2:
          - RandomShortSideResize:
              short_side_sizes: [400, 500, 600]
          - RandomSizeCrop:
              min_size: 384
              max_size: 600
          - RandomShortSideRangeResize:
              scales: [[400, 1400], [1400, 1400]]
  batch_transforms:
    - NormalizeImage:
        mean: *global_mean
        std: *global_std
        is_scale: true
    - PadGT:
        pad_img: true
        minimum_gtnum: 1
    - Permute: {}
  batch_size: 2
  shuffle: true
  drop_last: true
  use_shared_memory: true
  collate_batch: true

EvalReader:
  sample_transforms:
    - PETR_Resize:
        img_scale: [[800, 1333]]
        keep_ratio: true
    # Disabled alternative resize step, kept for reference:
    # - MultiscaleTestResize: {origin_target_size: [[800, 1333]], use_flip: false}
    - NormalizeImage:
        mean: *global_mean
        std: *global_std
        is_scale: true
    - Permute: {}
  batch_size: 1

TestReader:
  sample_transforms:
    - Decode: {}
    - EvalAffine:
        size: 800
    - NormalizeImage:
        mean: *global_mean
        std: *global_std
        is_scale: true
    - Permute: {}
  batch_size: 1