From aa9ff4381cf4bcab56c302f2902b67e8d9b0a015 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 5 Jun 2020 06:48:42 +0000 Subject: [PATCH] update yolov4 --- configs/yolov4/yolov4_cspdarknet_coco.yml | 74 ++-- configs/yolov4/yolov4_cspdarknet_voc.yml | 102 ++--- ppdet/data/reader.py | 24 ++ ppdet/data/transform/batch_operators.py | 3 +- ppdet/data/transform/operators.py | 480 +++++++++++++++++++++- ppdet/modeling/losses/iou_loss.py | 4 +- ppdet/modeling/losses/yolo_loss.py | 18 +- 7 files changed, 612 insertions(+), 93 deletions(-) diff --git a/configs/yolov4/yolov4_cspdarknet_coco.yml b/configs/yolov4/yolov4_cspdarknet_coco.yml index 8b4a15dc5..efbc33c52 100644 --- a/configs/yolov4/yolov4_cspdarknet_coco.yml +++ b/configs/yolov4/yolov4_cspdarknet_coco.yml @@ -25,7 +25,7 @@ YOLOv4Head: anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]] nms: background_label: -1 - keep_top_k: -1 + keep_top_k: 100 nms_threshold: 0.45 nms_top_k: -1 normalized: true @@ -40,21 +40,21 @@ YOLOv3Loss: # size here should be set as same value as TrainReader.batch_size batch_size: 8 ignore_thresh: 0.7 - label_smooth: true + label_smooth: false downsample: [8,16,32] scale_x_y: [1.2, 1.1, 1.05] iou_loss: IouLoss - match_score: true + ignore_class_score_thresh: 0.25 IouLoss: loss_weight: 0.07 max_height: 608 max_width: 608 ciou_term: true - loss_square: true + loss_square: false LearningRate: - base_lr: 0.0001 + base_lr: 0.0013 schedulers: - !PiecewiseDecay gamma: 0.1 @@ -77,8 +77,9 @@ OptimizerBuilder: _READER_: '../yolov3_reader.yml' TrainReader: inputs_def: - fields: ['image', 'gt_bbox', 'gt_class', 'gt_score', 'im_id'] - num_max_boxes: 50 + fields: ['image', 'gt_bbox', 'gt_class', 'gt_score'] + num_max_boxes: 90 + use_fine_grained_loss: true dataset: !COCODataSet image_dir: train2017 @@ -88,38 +89,43 @@ TrainReader: sample_transforms: - !DecodeImage to_rgb: True - - !ColorDistort {} - - !RandomExpand - fill_value: [123.675, 116.28, 103.53] - - !RandomCrop {} - - !RandomFlipImage - is_normalized: false + with_mosaic: True + - !MosaicImage + offset: 0.3 + mosaic_scale: [0.8, 1.0] + sample_scale: [0.3, 1.0] + sample_flip: 0.5 + use_cv2: true + interp: 2 - !NormalizeBox {} - !PadBox - num_max_boxes: 50 + num_max_boxes: 90 - !BboxXYXY2XYWH {} batch_transforms: - - !RandomShape - sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] - random_inter: True - - !NormalizeImage - mean: [0.,0.,0.] - std: [1.,1.,1.] 
-    is_scale: True
-    is_channel_first: false
-  - !Permute
-    to_bgr: false
-    channel_first: True
-  # Gt2YoloTarget is only used when use_fine_grained_loss set as true,
-  # this operator will be deleted automatically if use_fine_grained_loss
-  # is set as false
-  - !Gt2YoloTarget
-    anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
-    anchors: [[12, 16], [19, 36], [40, 28],
-              [36, 75], [76, 55], [72, 146],
-              [142, 110], [192, 243], [459, 401]]
-    downsample_ratios: [8, 16, 32]
+    - !RandomShape
+      sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
+      random_inter: True
+    - !NormalizeImage
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+      is_scale: True
+      is_channel_first: false
+    - !Permute
+      to_bgr: false
+      channel_first: True
+    # Gt2YoloTarget is only used when use_fine_grained_loss is set to true;
+    # this operator is deleted automatically if use_fine_grained_loss
+    # is set to false
+    - !Gt2YoloTarget
+      anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
+      anchors: [[12, 16], [19, 36], [40, 28],
+                [36, 75], [76, 55], [72, 146],
+                [142, 110], [192, 243], [459, 401]]
+      downsample_ratios: [8, 16, 32]
+      iou_thresh: 0.213
   batch_size: 8
+  mosaic_prob: 0.3
+  mosaic_epoch: 200
   shuffle: true
   drop_last: true
   worker_num: 8
diff --git a/configs/yolov4/yolov4_cspdarknet_voc.yml b/configs/yolov4/yolov4_cspdarknet_voc.yml
index beefaa0f1..af5cd6d39 100644
--- a/configs/yolov4/yolov4_cspdarknet_voc.yml
+++ b/configs/yolov4/yolov4_cspdarknet_voc.yml
@@ -1,11 +1,11 @@
 architecture: YOLOv4
 use_gpu: true
-max_iters: 140000
+max_iters: 70000
 log_smooth_window: 20
 save_dir: output
-snapshot_iter: 1000
+snapshot_iter: 2000
 metric: VOC
-pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/yolov4_cspdarknet.pdparams
+pretrain_weights: https://paddlemodels.bj.bcebos.com/object_detection/CSPDarkNet53_pretrained.pdparams
 weights: output/yolov4_cspdarknet_voc/model_final
 num_classes: 20
 use_fine_grained_loss: true
@@ -38,29 +38,29 @@ YOLOv3Loss:
   # for training batch_size setting, training batch_size setting
   # is in configs/yolov3_reader.yml TrainReader.batch_size, batch
   # size here should be set as same value as TrainReader.batch_size
-  batch_size: 4
+  batch_size: 8
   ignore_thresh: 0.7
-  label_smooth: true
+  label_smooth: false
   downsample: [8,16,32]
   scale_x_y: [1.2, 1.1, 1.05]
   iou_loss: IouLoss
-  match_score: true
+  ignore_class_score_thresh: 0.25

 IouLoss:
   loss_weight: 0.07
   max_height: 608
   max_width: 608
   ciou_term: true
-  loss_square: true
+  loss_square: false

 LearningRate:
-  base_lr: 0.0001
+  base_lr: 0.0013
   schedulers:
   - !PiecewiseDecay
     gamma: 0.1
     milestones:
-    - 110000
-    - 130000
+    - 56000
+    - 62000
   - !LinearWarmup
     start_factor: 0.
     steps: 1000
@@ -77,8 +77,9 @@ OptimizerBuilder:
 _READER_: '../yolov3_reader.yml'
 TrainReader:
   inputs_def:
-    fields: ['image', 'gt_bbox', 'gt_class', 'gt_score', 'im_id']
-    num_max_boxes: 50
+    fields: ['image', 'gt_bbox', 'gt_class', 'gt_score']
+    num_max_boxes: 90
+    use_fine_grained_loss: true
   dataset:
     !VOCDataSet
     anno_path: trainval.txt
@@ -87,38 +88,44 @@ TrainReader:
   sample_transforms:
     - !DecodeImage
       to_rgb: True
-    - !ColorDistort {}
-    - !RandomExpand
-      fill_value: [123.675, 116.28, 103.53]
-    - !RandomCrop {}
-    - !RandomFlipImage
-      is_normalized: false
+      with_mosaic: True
+    - !MosaicImage
+      offset: 0.3
+      mosaic_scale: [0.8, 1.0]
+      sample_scale: [0.3, 1.0]
+      sample_flip: 0.5
+      use_cv2: true
+      interp: 2
     - !NormalizeBox {}
     - !PadBox
-      num_max_boxes: 50
+      num_max_boxes: 90
     - !BboxXYXY2XYWH {}
   batch_transforms:
-  - !RandomShape
-    sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
-    random_inter: True
-  - !NormalizeImage
-    mean: [0.,0.,0.]
-    std: [1.,1.,1.]
-    is_scale: True
-    is_channel_first: false
-  - !Permute
-    to_bgr: false
-    channel_first: True
-  # Gt2YoloTarget is only used when use_fine_grained_loss set as true,
-  # this operator will be deleted automatically if use_fine_grained_loss
-  # is set as false
-  - !Gt2YoloTarget
-    anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
-    anchors: [[12, 16], [19, 36], [40, 28],
-              [36, 75], [76, 55], [72, 146],
-              [142, 110], [192, 243], [459, 401]]
-    downsample_ratios: [8, 16, 32]
-  batch_size: 4
+    - !RandomShape
+      sizes: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
+      random_inter: True
+    - !NormalizeImage
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
+      is_scale: True
+      is_channel_first: false
+    - !Permute
+      to_bgr: false
+      channel_first: True
+    # Gt2YoloTarget is only used when use_fine_grained_loss is set to true;
+    # this operator is deleted automatically if use_fine_grained_loss
+    # is set to false
+    - !Gt2YoloTarget
+      anchor_masks: [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
+      anchors: [[12, 16], [19, 36], [40, 28],
+                [36, 75], [76, 55], [72, 146],
+                [142, 110], [192, 243], [459, 401]]
+      downsample_ratios: [8, 16, 32]
+      num_classes: 20
+      iou_thresh: 0.213
+  batch_size: 8
+  mosaic_prob: 0.3
+  mosaic_epoch: 300
   shuffle: true
   drop_last: true
   worker_num: 8
@@ -141,10 +148,10 @@ EvalReader:
       to_rgb: True
     - !ResizeImage
       target_size: 608
-      interp: 1
+      interp: 2
     - !NormalizeImage
-      mean: [0., 0., 0.]
-      std: [1., 1., 1.]
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
       is_scale: True
       is_channel_first: false
     - !PadBox
@@ -152,12 +159,15 @@
     - !Permute
       to_bgr: false
       channel_first: True
-  batch_size: 4
+  batch_size: 8
   drop_empty: false
   worker_num: 8
   bufsize: 16

 TestReader:
+  inputs_def:
+    image_shape: [3, 608, 608]
+    fields: ['image', 'im_size', 'im_id']
   dataset:
     !ImageFolder
     use_default_label: true
@@ -169,8 +179,8 @@ TestReader:
       target_size: 608
       interp: 1
     - !NormalizeImage
-      mean: [0., 0., 0.]
-      std: [1., 1., 1.]
+      mean: [0.485, 0.456, 0.406]
+      std: [0.229, 0.224, 0.225]
       is_scale: True
       is_channel_first: false
     - !Permute
diff --git a/ppdet/data/reader.py b/ppdet/data/reader.py
index 5ed4a3f3d..61b5f1917 100644
--- a/ppdet/data/reader.py
+++ b/ppdet/data/reader.py
@@ -165,6 +165,7 @@ class Reader(object):
         drop_last (bool): whether drop last batch or not. Default False.
         drop_empty (bool): whether drop sample when it's gt is empty or not.
             Default True.
+        mosaic_epoch (int): mosaic epoch number. Default is -1, meaning
+            not use mosaic.
         mixup_epoch (int): mixup epoc number. Default is -1, meaning
             not use mixup.
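+        mosaic_prob (float): probability of applying mosaic to a sample
+            when mosaic is enabled. Default 0.5.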
         class_aware_sampling (bool): whether use class-aware sampling or not.
@@ -190,6 +191,8 @@
                  shuffle=False,
                  drop_last=False,
                  drop_empty=True,
+                 mosaic_epoch=-1,
+                 mosaic_prob=0.5,
                  mixup_epoch=-1,
                  class_aware_sampling=False,
                  worker_num=-1,
@@ -240,6 +243,8 @@
         self._drop_empty = drop_empty

         # sampling
+        self._mosaic_epoch = mosaic_epoch
+        self._mosaic_prob = mosaic_prob
         self._mixup_epoch = mixup_epoch
         self._class_aware_sampling = class_aware_sampling
@@ -285,6 +290,11 @@
         if self._shuffle:
             np.random.shuffle(self.indexes)

+        if self._mosaic_epoch > 0 and len(self.indexes) < 4:
+            logger.debug("Disable mosaic for datasets with fewer than "
+                         "4 samples")
+            self._mosaic_epoch = -1
+
         if self._mixup_epoch > 0 and len(self.indexes) < 2:
             logger.debug("Disable mixup for dataset samples "
                          "less than 2 samples")
@@ -338,6 +348,20 @@
         if self._load_img:
             sample['image'] = self._load_image(sample['im_file'])

+        if np.random.uniform(0., 1.) < self._mosaic_prob:
+            if self._epoch < self._mosaic_epoch:
+                # pick 3 extra samples near the current position to build a
+                # 2x2 mosaic; their images are decoded later by DecodeImage
+                num = len(self.indexes)
+                mosaic_idx = np.random.randint(1, num, size=3)
+                for i in range(len(mosaic_idx)):
+                    mosaic_idx[i] = self.indexes[(
+                        mosaic_idx[i] + self._pos - 1) % num]
+                    mosaic_name = 'mosaic' + str(i)
+                    sample[mosaic_name] = copy.deepcopy(self._roidbs[
+                        mosaic_idx[i]])
+                    if self._load_img:
+                        sample[mosaic_name]['image'] = self._load_image(
+                            sample[mosaic_name]['im_file'])
+
         if self._epoch < self._mixup_epoch:
             num = len(self.indexes)
             mix_idx = np.random.randint(1, num)
diff --git a/ppdet/data/transform/batch_operators.py b/ppdet/data/transform/batch_operators.py
index 8da0b1e35..c44626af1 100644
--- a/ppdet/data/transform/batch_operators.py
+++ b/ppdet/data/transform/batch_operators.py
@@ -261,7 +261,8 @@ class Gt2YoloTarget(BaseOperator):
                         iou = jaccard_overlap(
                             [0., 0., gw, gh],
                             [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]])
-                        if iou > self.iou_thresh:
+                        # only fill cells whose objectness (channel 5) is
+                        # still empty, so matched anchors are not overwritten
+                        if iou > self.iou_thresh and target[idx, 5, gj,
+                                                            gi] == 0.:
                             # x, y, w, h, scale
                             target[idx, 0, gj, gi] = gx * grid_w - gi
                             target[idx, 1, gj, gi] = gy * grid_h - gj
diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py
index 6d04454e5..1fa25e244 100644
--- a/ppdet/data/transform/operators.py
+++ b/ppdet/data/transform/operators.py
@@ -89,7 +89,7 @@ class BaseOperator(object):

 @register_op
 class DecodeImage(BaseOperator):
-    def __init__(self, to_rgb=True, with_mixup=False):
+    def __init__(self, to_rgb=True, with_mosaic=False, with_mixup=False):
         """ Transform the image data to numpy format.
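+        If with_mosaic is True, samples attached under the keys 'mosaic0',
+        'mosaic1' and 'mosaic2' are decoded recursively as well.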
         Args:
@@ -99,9 +99,12 @@
         super(DecodeImage, self).__init__()
         self.to_rgb = to_rgb
+        self.with_mosaic = with_mosaic
         self.with_mixup = with_mixup
         if not isinstance(self.to_rgb, bool):
             raise TypeError("{}: input type is invalid.".format(self))
+        if not isinstance(self.with_mosaic, bool):
+            raise TypeError("{}: input type is invalid.".format(self))
         if not isinstance(self.with_mixup, bool):
             raise TypeError("{}: input type is invalid.".format(self))
@@ -139,7 +142,18 @@
         # make default im_info with [h, w, 1]
         sample['im_info'] = np.array(
             [im.shape[0], im.shape[1], 1.], dtype=np.float32)
-        # decode mixup image
+
+        # decode the extra mosaic samples recursively
+        if self.with_mosaic:
+            for k in ('mosaic0', 'mosaic1', 'mosaic2'):
+                if k in sample:
+                    self.__call__(sample[k], context)
+
+        # decode mixup image
         if self.with_mixup and 'mixup' in sample:
             self.__call__(sample['mixup'], context)
         return sample
@@ -1030,6 +1044,468 @@ class Permute(BaseOperator):
         return samples


+@register_op
+class MosaicImage(BaseOperator):
+    def __init__(self,
+                 offset=0.2,
+                 mosaic_prob=0.5,
+                 mosaic_scale=[0.5, 2.0],
+                 sample_scale=[0.5, 2.0],
+                 sample_flip=0.5,
+                 use_cv2=False,
+                 interp=Image.BILINEAR):
+        super(MosaicImage, self).__init__()
+        self.offset = offset
+        # mosaic_prob is accepted for config compatibility; the sampling
+        # probability itself is applied in the Reader
+        self.mosaic_prob = mosaic_prob
+        self.mosaic_scale = mosaic_scale
+        self.sample_scale = sample_scale
+        self.sample_flip = sample_flip
+        # interp must be a cv2 flag when use_cv2 is True, a PIL filter otherwise
+        self.use_cv2 = use_cv2
+        self.interp = interp
+        self.crop = MosaicCrop()
+        if not (isinstance(self.mosaic_prob, float) and isinstance(
+                self.offset, float) and isinstance(self.mosaic_scale, list) and
+                isinstance(self.sample_scale, list) and
+                isinstance(self.sample_flip, float)):
+            raise TypeError("{}: input type is invalid.".format(self))
+
+    def _mosaic_img(self, img1, img2, img3, img4, h, w, cut_h, cut_w):
+        # the four tiles are already cropped to complementary sizes, so a
+        # plain 2x2 concatenation yields the (h, w) mosaic canvas
+        img_row1 = np.concatenate([img1, img2], axis=1)
+        img_row2 = np.concatenate([img3, img4], axis=1)
+        im = np.concatenate((img_row1, img_row2))
+        return im
+
+    def _mosaic_gt_bbox(self, sample, cut_h, cut_w):
+        gt_bbox1 = sample['gt_bbox']
+        gt_bbox2 = sample['mosaic0']['gt_bbox']
+        gt_bbox3 = sample['mosaic1']['gt_bbox']
+        gt_bbox4 = sample['mosaic2']['gt_bbox']
+
+        # shift each tile's boxes by its top-left corner on the canvas:
+        # top-right moves by cut_w, bottom-left by cut_h, bottom-right by both
+        new_gt_bbox = []
+        if len(gt_bbox1):
+            for box in gt_bbox1:
+                new_gt_bbox.append(box)
+
+        if len(gt_bbox2):
+            for box in gt_bbox2:
+                box[0] += cut_w
+                box[2] += cut_w
+                new_gt_bbox.append(box)
+
+        if len(gt_bbox3):
+            for box in gt_bbox3:
+                box[1] += cut_h
+                box[3] += cut_h
+                new_gt_bbox.append(box)
+
+        if len(gt_bbox4):
+            for box in gt_bbox4:
+                box[0] += cut_w
+                box[1] += cut_h
+                box[2] += cut_w
+                box[3] += cut_h
+                new_gt_bbox.append(box)
+
+        return np.array(new_gt_bbox)
+
+    def _mosaic_gt_score(self, sample):
+        # concatenate gt_score in the same tile order as the boxes
+        new_gt_score = []
+        for s in (sample, sample['mosaic0'], sample['mosaic1'],
+                  sample['mosaic2']):
+            new_gt_score.extend(s['gt_score'])
+        return np.array(new_gt_score)
+
+    def _mosaic_gt_class(self, sample):
+        # concatenate gt_class in the same tile order as the boxes
+        new_gt_class = []
+        for s in (sample, sample['mosaic0'], sample['mosaic1'],
+                  sample['mosaic2']):
+            new_gt_class.extend(s['gt_class'])
+        return np.array(new_gt_class)
+
+    def _mosaic_is_crowd(self, sample):
+        # concatenate is_crowd in the same tile order as the boxes
+        new_is_crowd = []
+        for s in (sample, sample['mosaic0'], sample['mosaic1'],
+                  sample['mosaic2']):
+            new_is_crowd.extend(s['is_crowd'])
+        return np.array(new_is_crowd)
+
+    def draw_bbox(self, img, gt_bbox, c=255):
+        # debug helper: draw xyxy boxes on the image
+        for bbox in gt_bbox:
+            x1, y1, x2, y2 = bbox
+            cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)),
+                          (0, 0, c), 2)
+        return img
+
+    def sample_scale_fun(self, sample, sample_scale, min_h, min_w):
+        # rescale the sample so that it is at least (min_h, min_w) and can
+        # therefore cover its mosaic quadrant
+        h = sample['h']
+        w = sample['w']
+        new_scale = sample_scale[:]
+        scale_min = max(min_h / h, min_w / w)
+        if scale_min > new_scale[1]:
+            scale = round(scale_min + 0.05, 1)
+        else:
+            new_scale[0] = max(new_scale[0], scale_min)
+            scale = round(random.uniform(*new_scale) + 0.05, 1)
+        # round up: plain int() truncation could drop below min_h or min_w
+        new_h = int(round(sample['h'] * scale + 0.5))
+        new_w = int(round(sample['w'] * scale + 0.5))
+        im = np.array(sample['image'])
+        if new_h < min_h or new_w < min_w:
+            logger.warning(
+                "scale error: scale={} h={} min_h={} w={} min_w={}".format(
+                    scale, h, min_h, w, min_w))
+
+        if self.use_cv2:
+            im = cv2.resize(im, (new_w, new_h), interpolation=self.interp)
+        else:
+            im = im.astype('uint8')
+            im = Image.fromarray(im)
+            im = im.resize((new_w, new_h), self.interp)
+            im = np.array(im)
+        sample['h'] = new_h
+        sample['w'] = new_w
+        sample['image'] = im
+        sample['gt_bbox'] = sample['gt_bbox'] * scale
+        return sample
+
+    def sample_flip_fun(self, sample, flip_prob):
+        if random.uniform(0, 1) < flip_prob:
+            w = sample['w']
+            gt_bbox = sample['gt_bbox']
+            if gt_bbox.shape[0] == 0:
+                return sample
+            old_x1 = gt_bbox[:, 0].copy()
+            old_x2 = gt_bbox[:, 2].copy()
+            gt_bbox[:, 0] = np.round(np.clip(w - old_x2 - 1, 0, w - 1), 2)
+            gt_bbox[:, 2] = np.round(np.clip(w - old_x1 - 1, 0, w - 1), 2)
+            if gt_bbox.shape[0] != 0 and (gt_bbox[:, 2] < gt_bbox[:, 0]).all():
+                m = "{}: invalid box, x2 should be greater than x1".format(
+                    self)
+                raise BboxError(m)
+            sample['gt_bbox'] = np.array(gt_bbox)
+            sample['image'] = sample['image'][:, ::-1, :]
+
+        return sample
+
+    def _org_img(self, sample):
+        # debug helper: render the four source images with their boxes as a
+        # 2x2 grid for comparison against the composed mosaic
+        img1 = sample['image'].copy()
+        gt1 = sample['gt_bbox']
+        img1 = self.draw_bbox(img1, gt1)
+        img2 = sample['mosaic0']['image'].copy()
+        gt2 = sample['mosaic0']['gt_bbox']
+        img2 = self.draw_bbox(img2, gt2)
+        img3 = sample['mosaic1']['image'].copy()
+        gt3 = sample['mosaic1']['gt_bbox']
+        img3 = self.draw_bbox(img3, gt3)
+        img4 = sample['mosaic2']['image'].copy()
+        gt4 = sample['mosaic2']['gt_bbox']
+        img4 = self.draw_bbox(img4, gt4)
+
+        # fixed 200x200 tiles keep the debug grid uniform
+        img1 = cv2.resize(img1, (200, 200))
+        img2 = cv2.resize(img2, (200, 200))
+        img3 = cv2.resize(img3, (200, 200))
+        img4 = cv2.resize(img4, (200, 200))
+
+        img_row1 = np.concatenate([img1, img2], axis=1)
+        img_row2 = np.concatenate([img3, img4], axis=1)
+        img = np.concatenate((img_row1, img_row2))
+
+        return img
+
+    def __call__(self, sample, context=None):
+        if 'mosaic0' not in sample:
+            # no extra samples attached (mosaic disabled or not drawn for
+            # this sample): random-crop and optionally flip the single image
+            sample = self.crop(sample, 0, 0)
+            if self.sample_flip:
+                sample = self.sample_flip_fun(sample, self.sample_flip)
+            return sample
+        h = sample['h']
+        w = sample['w']
+        # fall back to scale 1.0 when no mosaic_scale range is configured,
+        # so new_h and new_w are always defined
+        scale = 1.0
+        if self.mosaic_scale[0]:
+            scale = round(random.uniform(*self.mosaic_scale), 1)
+        new_h = int(h * scale)
+        new_w = int(w * scale)
+        # the cut point splits the canvas into the four quadrants
+        cut_h = np.random.randint(h * self.offset, h * (1 - self.offset))
+        cut_w = np.random.randint(w * self.offset, w * (1 - self.offset))
+
+        if self.sample_scale[0]:
+            # make sure every sample is large enough to cover its quadrant
+            sample = self.sample_scale_fun(sample, self.sample_scale, cut_h,
+                                           cut_w)
+            sample['mosaic0'] = self.sample_scale_fun(
+                sample['mosaic0'], self.sample_scale, cut_h, new_w - cut_w)
+            sample['mosaic1'] = self.sample_scale_fun(
+                sample['mosaic1'], self.sample_scale, new_h - cut_h, cut_w)
+            sample['mosaic2'] = self.sample_scale_fun(
+                sample['mosaic2'], self.sample_scale, new_h - cut_h,
+                new_w - cut_w)
+
+        if self.sample_flip:
+            sample = self.sample_flip_fun(sample, self.sample_flip)
+            sample['mosaic0'] = self.sample_flip_fun(sample['mosaic0'],
+                                                     self.sample_flip)
+            sample['mosaic1'] = self.sample_flip_fun(sample['mosaic1'],
+                                                     self.sample_flip)
+            sample['mosaic2'] = self.sample_flip_fun(sample['mosaic2'],
+                                                     self.sample_flip)
+
+        sample = self.crop(sample, width=cut_w, height=cut_h)
+        sample['mosaic0'] = self.crop(
+            sample['mosaic0'], width=new_w - cut_w, height=cut_h)
+        sample['mosaic1'] = self.crop(
+            sample['mosaic1'], width=cut_w, height=new_h - cut_h)
+        sample['mosaic2'] = self.crop(
+            sample['mosaic2'], width=new_w - cut_w, height=new_h - cut_h)
+
+        img = self._mosaic_img(sample['image'], sample['mosaic0']['image'],
+                               sample['mosaic1']['image'],
+                               sample['mosaic2']['image'], new_h, new_w,
+                               cut_h, cut_w)
+        gt_bbox = self._mosaic_gt_bbox(sample, cut_h, cut_w)
+        gt_score = self._mosaic_gt_score(sample)
+        gt_class = self._mosaic_gt_class(sample)
+        is_crowd = self._mosaic_is_crowd(sample)
+
+        sample['h'] = new_h
+        sample['w'] = new_w
+        sample['image'] = img
+        sample['gt_bbox'] = gt_bbox
+        sample['gt_class'] = gt_class
+        sample['gt_score'] = gt_score
+        sample['is_crowd'] = is_crowd
+        sample.pop('mosaic0')
+        sample.pop('mosaic1')
+        sample.pop('mosaic2')
+
+        return sample
+
+
+class MosaicCrop(object):
+    """Crop image and bboxes, either randomly (as in RandomCrop) or to an
+    exact size for one mosaic quadrant.
+
+    Args:
+        aspect_ratio (list): aspect ratio of cropped region
+            in [min, max] format.
+        thresholds (list): iou thresholds to decide a valid bbox crop.
+        scaling (list): ratio between a cropped region and the original image
+            in [min, max] format.
+        num_attempts (int): number of tries before giving up.
+        allow_no_crop (bool): allow returning without actually cropping.
+        cover_all_box (bool): ensure all bboxes are covered in the final crop.
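+        When __call__ receives a nonzero width/height, the crop has exactly
+        that size; this is how the four mosaic quadrants are cut.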
+    """
+
+    def __init__(self,
+                 aspect_ratio=[.5, 2.],
+                 thresholds=[.0, .1, .3, .5, .7, .9],
+                 scaling=[.3, 1.],
+                 num_attempts=50,
+                 allow_no_crop=True,
+                 cover_all_box=False):
+        super(MosaicCrop, self).__init__()
+        self.aspect_ratio = aspect_ratio
+        self.thresholds = thresholds
+        self.scaling = scaling
+        self.num_attempts = num_attempts
+        self.allow_no_crop = allow_no_crop
+        self.cover_all_box = cover_all_box
+
+    def __call__(self, sample, width=0, height=0, context=None):
+        if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0:
+            if width:
+                # no boxes to preserve: just cut the quadrant from the
+                # top-left corner and record the new size
+                sample['image'] = sample['image'][0:height, 0:width]
+                sample['w'] = width
+                sample['h'] = height
+            return sample
+
+        h = sample['h']
+        w = sample['w']
+        gt_bbox = sample['gt_bbox']
+
+        # NOTE Original method attempts to generate one candidate for each
+        # threshold then randomly sample one from the resulting list.
+        # Here a short circuit approach is taken, i.e., randomly choose a
+        # threshold and attempt to find a valid crop, and simply return the
+        # first one found.
+        # The probability is not exactly the same, kinda resembling the
+        # "Monty Hall" problem. Actually carrying out the attempts will affect
+        # observability (just like opening doors in the "Monty Hall" game).
+        thresholds = list(self.thresholds)
+        if self.allow_no_crop and not width:
+            thresholds.append('no_crop')
+        np.random.shuffle(thresholds)
+
+        for thresh in thresholds:
+            if thresh == 'no_crop':
+                return sample
+
+            found = False
+            for i in range(self.num_attempts):
+                if width:
+                    if w < width or h < height:
+                        raise ValueError(
+                            "image size ({}, {}) is smaller than "
+                            "crop size ({}, {})".format(w, h, width, height))
+                    crop_x = 0 if w == width else np.random.randint(
+                        0, w - width)
+                    crop_y = 0 if h == height else np.random.randint(
+                        0, h - height)
+                    crop_box = [crop_x, crop_y, crop_x + width,
+                                crop_y + height]
+
+                else:
+                    scale = np.random.uniform(*self.scaling)
+                    min_ar, max_ar = self.aspect_ratio
+                    aspect_ratio = np.random.uniform(
+                        max(min_ar, scale**2), min(max_ar, scale**-2))
+                    crop_h = int(h * scale / np.sqrt(aspect_ratio))
+                    crop_w = int(w * scale * np.sqrt(aspect_ratio))
+                    crop_y = np.random.randint(0, h - crop_h)
+                    crop_x = np.random.randint(0, w - crop_w)
+                    crop_box = [
+                        crop_x, crop_y, crop_x + crop_w, crop_y + crop_h
+                    ]
+                iou = self._iou_matrix(
+                    gt_bbox, np.array(
+                        [crop_box], dtype=np.float32))
+                if iou.max() < thresh:
+                    continue
+
+                if self.cover_all_box and iou.min() < thresh:
+                    continue
+
+                cropped_box, valid_ids = self._crop_box_with_center_constraint(
+                    gt_bbox, np.array(
+                        crop_box, dtype=np.float32))
+                if valid_ids.size > 0:
+                    found = True
+                    break
+
+            if found:
+                sample['image'] = self._crop_image(sample['image'], crop_box)
+                sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0)
+                sample['gt_class'] = np.take(
+                    sample['gt_class'], valid_ids, axis=0)
+                sample['w'] = crop_box[2] - crop_box[0]
+                sample['h'] = crop_box[3] - crop_box[1]
+                if 'gt_score' in sample:
+                    sample['gt_score'] = np.take(
+                        sample['gt_score'], valid_ids, axis=0)
+                if 'is_crowd' in sample:
+                    sample['is_crowd'] = np.take(
+                        sample['is_crowd'], valid_ids, axis=0)
+                return sample
+
+        if width:
+            # no valid crop found: cut the quadrant anyway and drop all boxes
+            crop_box = [0, 0, width, height]
+            sample['image'] = self._crop_image(sample['image'], crop_box)
+            sample['gt_bbox'] = np.array([])
+            sample['gt_class'] = np.array([])
+            sample['w'] = crop_box[2] - crop_box[0]
+            sample['h'] = crop_box[3] - crop_box[1]
+            if 'gt_score' in sample:
+                sample['gt_score'] = np.array([])
+            if 'is_crowd' in sample:
+                sample['is_crowd'] = np.array([])
+            return sample
+
+        return sample
+
+    def _iou_matrix(self, a, b):
+        tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+        br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+        area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2)
+        area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+        area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+        area_o = (area_a[:, np.newaxis] + area_b - area_i)
+        return area_i / (area_o + 1e-10)
+
+    def _crop_box_with_center_constraint(self, box, crop):
+        cropped_box = box.copy()
+
+        cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2])
+        cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:])
+        cropped_box[:, :2] -= crop[:2]
+        cropped_box[:, 2:] -= crop[:2]
+
+        centers = (box[:, :2] + box[:, 2:]) / 2
+        valid = np.logical_and(crop[:2] <= centers,
+                               centers < crop[2:]).all(axis=1)
+        valid = np.logical_and(
+            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))
+
+        return cropped_box, np.where(valid)[0]
+
+    def _crop_image(self, img, crop):
+        x1, y1, x2, y2 = crop
+        return img[y1:y2, x1:x2, :]
+
+
 @register_op
 class MixupImage(BaseOperator):
     def __init__(self, alpha=1.5, beta=1.5):
diff --git a/ppdet/modeling/losses/iou_loss.py b/ppdet/modeling/losses/iou_loss.py
index 5cd8aaccf..61ee495e0 100644
--- a/ppdet/modeling/losses/iou_loss.py
+++ b/ppdet/modeling/losses/iou_loss.py
@@ -115,8 +115,8 @@ class IouLoss(object):
         cx = (x1 + x2) / 2
         cy = (y1 + y2) / 2
-        w = (x2 - x1) + fluid.layers.cast((x2 - x1) == 0, 'float32')
-        h = (y2 - y1) + fluid.layers.cast((y2 - y1) == 0, 'float32')
+        w = x2 - x1
+        h = (y2 - y1) + fluid.layers.cast((y2 - y1) == 0, 'float32') * eps
         cxg = (x1g + x2g) / 2
         cyg = (y1g + y2g) / 2
diff --git a/ppdet/modeling/losses/yolo_loss.py b/ppdet/modeling/losses/yolo_loss.py
index 732edb39d..66967f30b 100644
--- a/ppdet/modeling/losses/yolo_loss.py
+++ b/ppdet/modeling/losses/yolo_loss.py
@@ -50,7 +50,7 @@ class YOLOv3Loss(object):
                  iou_aware_loss=None,
                  downsample=[32, 16, 8],
                  scale_x_y=1.,
-                 match_score=False):
+                 ignore_class_score_thresh=-1.):
         self._batch_size = batch_size
         self._ignore_thresh = ignore_thresh
         self._label_smooth = label_smooth
@@ -59,7 +59,7 @@
         self._iou_aware_loss = iou_aware_loss
         self.downsample = downsample
         self.scale_x_y = scale_x_y
-        self.match_score = match_score
+        self.ignore_class_score_thresh = ignore_class_score_thresh

     def __call__(self, outputs, gt_box, gt_label, gt_score, targets, anchors,
                  anchor_masks, mask_anchors, num_classes, prefix_name):
@@ -167,7 +167,7 @@ class YOLOv3Loss(object):
                 self.scale_x_y, Sequence) else self.scale_x_y[i]
             loss_obj_pos, loss_obj_neg = self._calc_obj_loss(
                 output, obj, tobj, gt_box, self._batch_size, anchors,
-                num_classes, downsample, self._ignore_thresh, scale_x_y)
+                num_classes, downsample, self._ignore_thresh, scale_x_y, cls)

             loss_cls = fluid.layers.sigmoid_cross_entropy_with_logits(cls, tcls)
             loss_cls = fluid.layers.elementwise_mul(loss_cls, tobj, axis=0)
@@ -277,7 +277,7 @@
         return (tx, ty, tw, th, tscale, tobj, tcls)

     def _calc_obj_loss(self, output, obj, tobj, gt_box, batch_size, anchors,
-                       num_classes, downsample, ignore_thresh, scale_x_y):
+                       num_classes, downsample, ignore_thresh, scale_x_y, cls):

         # A prediction bbox overlap any gt_bbox over ignore_thresh,
         # objectness loss will be ignored, process as follows:
@@ -329,14 +329,16 @@
         max_iou = fluid.layers.reduce_max(iou, dim=-1)
         iou_mask = fluid.layers.cast(max_iou <= ignore_thresh, dtype="float32")
-        if self.match_score:
-            max_prob = fluid.layers.reduce_max(prob, dim=-1)
-            iou_mask = iou_mask * fluid.layers.cast(
-                max_prob <= 0.25, dtype="float32")
         output_shape = fluid.layers.shape(output)
         an_num = len(anchors) // 2
         iou_mask = fluid.layers.reshape(iou_mask, (-1, an_num,
                                                    output_shape[2],
                                                    output_shape[3]))
+        if self.ignore_class_score_thresh > 0.:
+            # ignore the objectness loss only where both the best IoU and
+            # the best class score are high
+            max_cls = fluid.layers.reduce_max(fluid.layers.sigmoid(cls), dim=-1)
+            iou_mask = fluid.layers.elementwise_max(
+                fluid.layers.cast(
+                    max_cls <= self.ignore_class_score_thresh, dtype="float32"),
+                iou_mask)
         iou_mask.stop_gradient = True

         # NOTE: tobj holds gt_score, obj_mask holds object existence mask
--
GitLab
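
For reference, a minimal standalone sketch of the 2x2 composition that
MosaicImage and _mosaic_gt_bbox implement above. The name mosaic4 and the toy
data are invented for illustration only; the real operator cuts each tile at a
random position via MosaicCrop and discards boxes whose centers leave the
crop, whereas this sketch cuts from the top-left corner for brevity.

    import numpy as np


    def mosaic4(imgs, boxes, cut_h, cut_w, out_h, out_w):
        """Compose a 2x2 mosaic from 4 images ordered TL, TR, BL, BR."""
        # each tile is cut to its quadrant size around the cut point
        sizes = [(cut_h, cut_w), (cut_h, out_w - cut_w),
                 (out_h - cut_h, cut_w), (out_h - cut_h, out_w - cut_w)]
        tiles = [im[:th, :tw] for im, (th, tw) in zip(imgs, sizes)]
        row1 = np.concatenate([tiles[0], tiles[1]], axis=1)
        row2 = np.concatenate([tiles[2], tiles[3]], axis=1)
        canvas = np.concatenate([row1, row2], axis=0)

        # shift each tile's xyxy boxes by the tile's top-left corner on the
        # canvas, mirroring _mosaic_gt_bbox: TR by cut_w, BL by cut_h,
        # BR by both
        offsets = [(0, 0), (cut_w, 0), (0, cut_h), (cut_w, cut_h)]
        shifted = []
        for b, (dx, dy) in zip(boxes, offsets):
            if len(b):
                b = b.astype('float32')
                b[:, [0, 2]] += dx
                b[:, [1, 3]] += dy
                shifted.append(b)
        gt = np.concatenate(shifted) if shifted else np.zeros((0, 4), 'float32')
        return canvas, gt


    if __name__ == '__main__':
        rng = np.random.RandomState(0)
        imgs = [rng.randint(0, 255, (608, 608, 3), dtype=np.uint8)
                for _ in range(4)]
        boxes = [np.array([[10., 10., 100., 100.]]) for _ in range(4)]
        img, gt = mosaic4(imgs, boxes, cut_h=300, cut_w=280,
                          out_h=608, out_w=608)
        print(img.shape)  # (608, 608, 3)
        print(gt)         # one box per quadrant, shifted into place

With cut_h=300 and cut_w=280, the first box stays at (10, 10, 100, 100) while
the other three are shifted by (280, 0), (0, 300) and (280, 300) respectively,
matching the per-quadrant offsets that _mosaic_gt_bbox applies.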