From dfc40ee0ae4794fe623eff850a5a8d7b8df609c9 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Wed, 14 Jul 2021 18:51:57 +0800 Subject: [PATCH] add DETR (#3690) --- configs/detr/_base_/detr_r50.yml | 44 ++++++++++++++ configs/detr/_base_/detr_reader.yml | 49 ++++++++++++++++ configs/detr/_base_/optimizer_1x.yml | 16 +++++ configs/detr/detr_r50_1x_coco.yml | 8 +++ ppdet/data/transform/batch_operators.py | 77 +++++++++++++++++++++++-- 5 files changed, 190 insertions(+), 4 deletions(-) create mode 100644 configs/detr/_base_/detr_r50.yml create mode 100644 configs/detr/_base_/detr_reader.yml create mode 100644 configs/detr/_base_/optimizer_1x.yml create mode 100644 configs/detr/detr_r50_1x_coco.yml diff --git a/configs/detr/_base_/detr_r50.yml b/configs/detr/_base_/detr_r50.yml new file mode 100644 index 000000000..d234fadd6 --- /dev/null +++ b/configs/detr/_base_/detr_r50.yml @@ -0,0 +1,44 @@ +architecture: DETR +pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ResNet50_cos_pretrained.pdparams +hidden_dim: 256 + + +DETR: + backbone: ResNet + transformer: DETRTransformer + detr_head: DETRHead + post_process: DETRBBoxPostProcess + + +ResNet: + # index 0 stands for res2 + depth: 50 + norm_type: bn + freeze_at: 0 + return_idx: [3] + lr_mult_list: [0.0, 0.1, 0.1, 0.1] + num_stages: 4 + + +DETRTransformer: + num_queries: 100 + position_embed_type: sine + nhead: 8 + num_encoder_layers: 6 + num_decoder_layers: 6 + dim_feedforward: 2048 + dropout: 0.1 + activation: relu + + +DETRHead: + num_mlp_layers: 3 + + +DETRLoss: + loss_coeff: {class: 1, bbox: 5, giou: 2, no_object: 0.1, mask: 1, dice: 1} + aux_loss: True + + +HungarianMatcher: + matcher_coeff: {class: 1, bbox: 5, giou: 2} diff --git a/configs/detr/_base_/detr_reader.yml b/configs/detr/_base_/detr_reader.yml new file mode 100644 index 000000000..5a7199296 --- /dev/null +++ b/configs/detr/_base_/detr_reader.yml @@ -0,0 +1,49 @@ +worker_num: 0 +TrainReader: + sample_transforms: + - Decode: {} + - RandomFlip: {prob: 0.5} + - RandomSelect: { transforms1: [ RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ], + transforms2: [ + RandomShortSideResize: { short_side_sizes: [ 400, 500, 600 ] }, + RandomSizeCrop: { min_size: 384, max_size: 600 }, + RandomShortSideResize: { short_side_sizes: [ 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800 ], max_size: 1333 } ] + } + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - NormalizeBox: {} + - BboxXYXY2XYWH: {} + - Permute: {} + batch_transforms: + - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true} + batch_size: 2 + shuffle: true + drop_last: true + collate_batch: false + use_shared_memory: false + + +EvalReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true} + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + + +TestReader: + sample_transforms: + - Decode: {} + - Resize: {target_size: [800, 1333], keep_ratio: True} + - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]} + - Permute: {} + batch_transforms: + - PadMaskBatch: {pad_to_stride: -1, return_pad_mask: true} + batch_size: 1 + shuffle: false + drop_last: false diff --git a/configs/detr/_base_/optimizer_1x.yml b/configs/detr/_base_/optimizer_1x.yml new file mode 100644 index 000000000..13528c5eb --- /dev/null +++ b/configs/detr/_base_/optimizer_1x.yml @@ -0,0 +1,16 @@ +epoch: 500 + +LearningRate: + base_lr: 0.0001 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [400] + use_warmup: false + +OptimizerBuilder: + clip_grad_by_norm: 0.1 + regularizer: false + optimizer: + type: AdamW + weight_decay: 0.0001 diff --git a/configs/detr/detr_r50_1x_coco.yml b/configs/detr/detr_r50_1x_coco.yml new file mode 100644 index 000000000..0df91bb56 --- /dev/null +++ b/configs/detr/detr_r50_1x_coco.yml @@ -0,0 +1,8 @@ +_BASE_: [ + '../datasets/coco_detection.yml', + '../runtime.yml', + '_base_/optimizer_1x.yml', + '_base_/detr_r50.yml', + '_base_/detr_reader.yml', +] +weights: output/detr_r50_1x_coco/model_final diff --git a/ppdet/data/transform/batch_operators.py b/ppdet/data/transform/batch_operators.py index ffe4fcc46..81b5ef728 100644 --- a/ppdet/data/transform/batch_operators.py +++ b/ppdet/data/transform/batch_operators.py @@ -33,7 +33,7 @@ logger = setup_logger(__name__) __all__ = [ 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', - 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseRCNNTarget' + 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseRCNNTarget', 'PadMaskBatch' ] @@ -764,10 +764,79 @@ class Gt2SparseRCNNTarget(BaseOperator): img_whwh = np.array([w, h, w, h], dtype=np.int32) sample["img_whwh"] = img_whwh if "scale_factor" in sample: - sample["scale_factor_wh"] = np.array([sample["scale_factor"][1], sample["scale_factor"][0]], - dtype=np.float32) + sample["scale_factor_wh"] = np.array( + [sample["scale_factor"][1], sample["scale_factor"][0]], + dtype=np.float32) sample.pop("scale_factor") else: - sample["scale_factor_wh"] = np.array([1.0, 1.0], dtype=np.float32) + sample["scale_factor_wh"] = np.array( + [1.0, 1.0], dtype=np.float32) + + return samples + + +@register_op +class PadMaskBatch(BaseOperator): + """ + Pad a batch of samples so they can be divisible by a stride. + The layout of each image should be 'CHW'. + Args: + pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure + height and width is divisible by `pad_to_stride`. + return_pad_mask (bool): If `return_pad_mask = True`, return + `pad_mask` for transformer. + """ + + def __init__(self, pad_to_stride=0, return_pad_mask=False): + super(PadMaskBatch, self).__init__() + self.pad_to_stride = pad_to_stride + self.return_pad_mask = return_pad_mask + + def __call__(self, samples, context=None): + """ + Args: + samples (list): a batch of sample, each is dict. + """ + coarsest_stride = self.pad_to_stride + + max_shape = np.array([data['image'].shape for data in samples]).max( + axis=0) + if coarsest_stride > 0: + max_shape[1] = int( + np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) + max_shape[2] = int( + np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) + + for data in samples: + im = data['image'] + im_c, im_h, im_w = im.shape[:] + padding_im = np.zeros( + (im_c, max_shape[1], max_shape[2]), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + data['image'] = padding_im + if 'semantic' in data and data['semantic'] is not None: + semantic = data['semantic'] + padding_sem = np.zeros( + (1, max_shape[1], max_shape[2]), dtype=np.float32) + padding_sem[:, :im_h, :im_w] = semantic + data['semantic'] = padding_sem + if 'gt_segm' in data and data['gt_segm'] is not None: + gt_segm = data['gt_segm'] + padding_segm = np.zeros( + (gt_segm.shape[0], max_shape[1], max_shape[2]), + dtype=np.uint8) + padding_segm[:, :im_h, :im_w] = gt_segm + data['gt_segm'] = padding_segm + if self.return_pad_mask: + padding_mask = np.zeros( + (max_shape[1], max_shape[2]), dtype=np.float32) + padding_mask[:im_h, :im_w] = 1. + data['pad_mask'] = padding_mask + + if 'gt_rbox2poly' in data and data['gt_rbox2poly'] is not None: + # ploy to rbox + polys = data['gt_rbox2poly'] + rbox = bbox_utils.poly2rbox(polys) + data['gt_rbox'] = rbox return samples -- GitLab