diff --git a/configs/solov2/README.md b/configs/solov2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0dd5fd8eccbadd0d79c777e04193bcf28da104fe --- /dev/null +++ b/configs/solov2/README.md @@ -0,0 +1,22 @@ +# SOLOv2 (Segmenting Objects by Locations) for instance segmentation + +## Introduction + +- SOLOv2 is a fast instance segmentation framework with strong performance: [https://arxiv.org/abs/2003.10152](https://arxiv.org/abs/2003.10152) + +``` +@misc{wang2020solov2, + title={SOLOv2: Dynamic, Faster and Stronger}, + author={Xinlong Wang and Rufeng Zhang and Tao Kong and Lei Li and Chunhua Shen}, + year={2020}, + eprint={2003.10152}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Model Zoo + +| Backbone | Multi-scale training | Lr schd | Inf time (fps) | Mask AP | Download | Configs | +| :---------------------: | :-------------------: | :-----: | :------------: | :-----: | :---------: | :------------------------: | +| R50-FPN | False | 1x | - | 34.7 | [model](https://paddlemodels.bj.bcebos.com/object_detection/solov2_r50_fpn_1x.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/master/configs/solov2/solov2_r50_fpn_1x.yml) | diff --git a/configs/solov2/solov2_r50_fpn_1x.yml b/configs/solov2/solov2_r50_fpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..f7f26475a3b43803fded5c6d8304f74d35e26a30 --- /dev/null +++ b/configs/solov2/solov2_r50_fpn_1x.yml @@ -0,0 +1,62 @@ +architecture: SOLOv2 +use_gpu: true +max_iters: 90000 +snapshot_iter: 10000 +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/solov2_r50_fpn_1x/model_final +num_classes: 81 + +SOLOv2: + backbone: ResNet + fpn: FPN + bbox_head: SOLOv2Head + mask_head: SOLOv2MaskHead + batch_size: 2 + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + +FPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + reverse_out: True + +SOLOv2Head: + seg_feat_channels: 512 + stacked_convs: 4 + num_grids: [40, 36, 24, 16, 12] + kernel_out_channels: 256 + +SOLOv2MaskHead: + out_channels: 128 + start_level: 0 + end_level: 3 + num_classes: 256 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0. 
+ steps: 1000 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +_READER_: 'solov2_reader.yml' diff --git a/configs/solov2/solov2_reader.yml b/configs/solov2/solov2_reader.yml new file mode 100644 index 0000000000000000000000000000000000000000..d8da6c4deafcdd74ad7b1946acef804bfb4c1bef --- /dev/null +++ b/configs/solov2/solov2_reader.yml @@ -0,0 +1,99 @@ +TrainReader: + batch_size: 2 + worker_num: 2 + inputs_def: + fields: ['image', 'im_id', 'gt_segm'] + dataset: + !COCODataSet + dataset_dir: dataset/coco + anno_path: annotations/instances_train2017.json + image_dir: train2017 + sample_transforms: + - !DecodeImage + to_rgb: true + - !Poly2Mask {} + - !ResizeImage + target_size: 800 + max_size: 1333 + interp: 1 + use_cv2: true + resize_box: true + - !RandomFlipImage + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !Permute + to_bgr: false + channel_first: true + batch_transforms: + - !PadBatch + pad_to_stride: 32 + - !Gt2Solov2Target + num_grids: [40, 36, 24, 16, 12] + scale_ranges: [[1, 96], [48, 192], [96, 384], [192, 768], [384, 2048]] + coord_sigma: 0.2 + shuffle: True + +EvalReader: + inputs_def: + fields: ['image', 'im_info', 'im_id'] + dataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + sample_transforms: + - !DecodeImage + to_rgb: true + - !ResizeImage + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: false + batch_size: 1 + shuffle: false + drop_last: false + drop_empty: false + worker_num: 2 + +TestReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'im_shape'] + dataset: + !ImageFolder + anno_path: dataset/coco/annotations/instances_val2017.json + sample_transforms: + - !DecodeImage + to_rgb: true + - !ResizeImage + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: false diff --git a/deploy/python/infer.py b/deploy/python/infer.py index 57214f577e4fa9488e401eafacbcd8c4ffe4f624..249d11c2e653425bcaaf0dd1196db6cc90db03b2 100644 --- a/deploy/python/infer.py +++ b/deploy/python/infer.py @@ -30,6 +30,7 @@ RESIZE_SCALE_SET = { 'RCNN', 'RetinaNet', 'FCOS', + 'SOLOv2', } SUPPORT_MODELS = { @@ -41,6 +42,7 @@ SUPPORT_MODELS = { 'Face', 'TTF', 'FCOS', + 'SOLOv2', } @@ -85,7 +87,8 @@ class Resize(object): max_size, use_cv2=True, image_shape=None, - interp=cv2.INTER_LINEAR): + interp=cv2.INTER_LINEAR, + resize_box=False): self.target_size = target_size self.max_size = max_size self.image_shape = image_shape @@ -251,7 +254,7 @@ class PadStride(object): pad_w = int(np.ceil(float(im_w) / coarsest_stride) * coarsest_stride) padding_im = np.zeros((im_c, pad_h, pad_w), dtype=np.float32) padding_im[:, :im_h, :im_w] = im - im_info['resize_shape'] = padding_im.shape[1:] + im_info['pad_shape'] = padding_im.shape[1:] return padding_im, im_info @@ -268,23 +271,29 @@ def create_inputs(im, im_info, model_arch='YOLO'): inputs['image'] = im origin_shape = 
list(im_info['origin_shape']) resize_shape = list(im_info['resize_shape']) + pad_shape = list(im_info['pad_shape']) if 'pad_shape' in im_info else list( + im_info['resize_shape']) scale_x, scale_y = im_info['scale'] if 'YOLO' in model_arch: im_size = np.array([origin_shape]).astype('int32') inputs['im_size'] = im_size - elif 'RetinaNet' or 'EfficientDet' in model_arch: + elif 'RetinaNet' in model_arch or 'EfficientDet' in model_arch: scale = scale_x - im_info = np.array([resize_shape + [scale]]).astype('float32') + im_info = np.array([pad_shape + [scale]]).astype('float32') inputs['im_info'] = im_info elif ('RCNN' in model_arch) or ('FCOS' in model_arch): scale = scale_x - im_info = np.array([resize_shape + [scale]]).astype('float32') + im_info = np.array([pad_shape + [scale]]).astype('float32') im_shape = np.array([origin_shape + [1.]]).astype('float32') inputs['im_info'] = im_info inputs['im_shape'] = im_shape elif 'TTF' in model_arch: scale_factor = np.array([scale_x, scale_y] * 2).astype('float32') inputs['scale_factor'] = scale_factor + elif 'SOLOv2' in model_arch: + scale = scale_x + im_info = np.array([resize_shape + [scale]]).astype('float32') + inputs['im_info'] = im_info return inputs @@ -405,10 +414,15 @@ def visualize(image_file, results, labels, mask_resolution=14, - output_dir='output/'): + output_dir='output/', + threshold=0.5): # visualize the predict result im = visualize_box_mask( - image_file, results, labels, mask_resolution=mask_resolution) + image_file, + results, + labels, + mask_resolution=mask_resolution, + threshold=threshold) img_name = os.path.split(image_file)[-1] if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -516,6 +530,11 @@ class Detector(): ms = (t2 - t1) * 1000.0 / repeats print("Inference: {} ms per batch image".format(ms)) + if self.config.arch == 'SOLOv2': + return dict( + segm=np.array(outs[2]), + label=np.array(outs[0]), + score=np.array(outs[1])) np_boxes = np.array(outs[0]) if self.config.mask_resolution is not None: np_masks = np.array(outs[1]) @@ -539,6 +558,13 @@ class Detector(): for i in range(repeats): self.predictor.zero_copy_run() output_names = self.predictor.get_output_names() + if self.config.arch == 'SOLOv2': + np_label = self.predictor.get_output_tensor(output_names[ + 0]).copy_to_cpu() + np_score = self.predictor.get_output_tensor(output_names[ + 1]).copy_to_cpu() + np_segms = self.predictor.get_output_tensor(output_names[ + 2]).copy_to_cpu() boxes_tensor = self.predictor.get_output_tensor(output_names[0]) np_boxes = boxes_tensor.copy_to_cpu() if self.config.mask_resolution is not None: @@ -552,6 +578,9 @@ class Detector(): # do not perform postprocess in benchmark mode results = [] if not run_benchmark: + if self.config.arch == 'SOLOv2': + return dict(segm=np_segms, label=np_label, score=np_score) + if reduce(lambda x, y: x * y, np_boxes.shape) < 6: print('[WARNNING] No object detected.') results = {'boxes': np.array([])} @@ -579,7 +608,8 @@ def predict_image(): results, detector.config.labels, mask_resolution=detector.config.mask_resolution, - output_dir=FLAGS.output_dir) + output_dir=FLAGS.output_dir, + threshold=FLAGS.threshold) def predict_video(camera_id): diff --git a/deploy/python/visualize.py b/deploy/python/visualize.py index e4d5f6ac7666e824f3278d1cf9b0a42835d78c8c..beab473d87b6e554a07da2d516c44bf622f13098 100644 --- a/deploy/python/visualize.py +++ b/deploy/python/visualize.py @@ -18,20 +18,22 @@ from __future__ import division import cv2 import numpy as np from PIL import Image, ImageDraw +from scipy import 
ndimage -def visualize_box_mask(im, results, labels, mask_resolution=14): - """ +def visualize_box_mask(im, results, labels, mask_resolution=14, threshold=0.5): + """ Args: im (str/np.ndarray): path of image/np.ndarray read by cv2 - results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, + results (dict): include 'boxes': np.ndarray: shape:[N,6], N: number of box, matix element:[class, score, x_min, y_min, x_max, y_max] - MaskRCNN's results include 'masks': np.ndarray: - shape:[N, class_num, mask_resolution, mask_resolution] + MaskRCNN's results include 'masks': np.ndarray: + shape:[N, class_num, mask_resolution, mask_resolution] labels (list): labels:['class1', ..., 'classn'] mask_resolution (int): shape of a mask is:[mask_resolution, mask_resolution] + threshold (float): Threshold of score. Returns: - im (PIL.Image.Image): visualized image + im (PIL.Image.Image): visualized image """ if isinstance(im, str): im = Image.open(im).convert('RGB') @@ -46,15 +48,23 @@ def visualize_box_mask(im, results, labels, mask_resolution=14): resolution=mask_resolution) if 'boxes' in results: im = draw_box(im, results['boxes'], labels) + if 'segm' in results: + im = draw_segm( + im, + results['segm'], + results['label'], + results['score'], + labels, + threshold=threshold) return im def get_color_map_list(num_classes): - """ + """ Args: num_classes (int): number of class Returns: - color_map (list): RGB color list + color_map (list): RGB color list """ color_map = num_classes * [0, 0, 0] for i in range(0, num_classes): @@ -71,9 +81,9 @@ def get_color_map_list(num_classes): def expand_boxes(boxes, scale=0.0): - """ + """ Args: - boxes (np.ndarray): shape:[N,4], N:number of box, + boxes (np.ndarray): shape:[N,4], N:number of box, matix element:[x_min, y_min, x_max, y_max] scale (float): scale of boxes Returns: @@ -94,17 +104,17 @@ def expand_boxes(boxes, scale=0.0): def draw_mask(im, np_boxes, np_masks, labels, resolution=14, threshold=0.5): - """ + """ Args: im (PIL.Image.Image): PIL image - np_boxes (np.ndarray): shape:[N,6], N: number of box, + np_boxes (np.ndarray): shape:[N,6], N: number of box, matix element:[class, score, x_min, y_min, x_max, y_max] np_masks (np.ndarray): shape:[N, class_num, resolution, resolution] labels (list): labels:['class1', ..., 'classn'] resolution (int): shape of a mask is:[resolution, resolution] threshold (float): threshold of mask Returns: - im (PIL.Image.Image): visualized image + im (PIL.Image.Image): visualized image """ color_list = get_color_map_list(len(labels)) scale = (resolution + 2.0) / resolution @@ -149,14 +159,14 @@ def draw_mask(im, np_boxes, np_masks, labels, resolution=14, threshold=0.5): def draw_box(im, np_boxes, labels): - """ + """ Args: im (PIL.Image.Image): PIL image - np_boxes (np.ndarray): shape:[N,6], N: number of box, + np_boxes (np.ndarray): shape:[N,6], N: number of box, matix element:[class, score, x_min, y_min, x_max, y_max] labels (list): labels:['class1', ..., 'classn'] Returns: - im (PIL.Image.Image): visualized image + im (PIL.Image.Image): visualized image """ draw_thickness = min(im.size) // 320 draw = ImageDraw.Draw(im) @@ -186,3 +196,43 @@ def draw_box(im, np_boxes, labels): [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) return im + + +def draw_segm(im, + np_segms, + np_label, + np_score, + labels, + threshold=0.5, + alpha=0.7): + """ + Draw segmentation on image + """ + mask_color_id = 0 + w_ratio = .4 + color_list = 
get_color_map_list(len(labels))
+    im = np.array(im).astype('float32')
+    clsid2color = {}
+    np_segms = np_segms.astype(np.uint8)
+    for i in range(np_segms.shape[0]):
+        mask, score, clsid = np_segms[i], np_score[i], np_label[i] + 1
+        if score < threshold:
+            continue
+
+        if clsid not in clsid2color:
+            clsid2color[clsid] = color_list[clsid]
+        color_mask = clsid2color[clsid]
+        for c in range(3):
+            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
+        idx = np.nonzero(mask)
+        color_mask = np.array(color_mask)
+        im[idx[0], idx[1], :] *= 1.0 - alpha
+        im[idx[0], idx[1], :] += alpha * color_mask
+        center_y, center_x = ndimage.measurements.center_of_mass(mask)
+        label_text = "{}".format(labels[clsid])
+        vis_pos = (max(int(center_x) - 10, 0), int(center_y))
+        cv2.putText(im, label_text, vis_pos, cv2.FONT_HERSHEY_COMPLEX, 0.3,
+                    (255, 255, 255))
+    return Image.fromarray(im.astype('uint8'))
diff --git a/ppdet/data/transform/batch_operators.py b/ppdet/data/transform/batch_operators.py
index 331752d779e974c9167b365f4148a3038c727de1..31b2c10a49ca69a377aa373161ed673d12156f2b 100644
--- a/ppdet/data/transform/batch_operators.py
+++ b/ppdet/data/transform/batch_operators.py
@@ -24,6 +24,7 @@ except Exception:
 import logging
 import cv2
 import numpy as np
+from scipy import ndimage
 
 from .operators import register_op, BaseOperator
 from .op_helper import jaccard_overlap, gaussian2D
@@ -37,6 +38,7 @@ __all__ = [
     'Gt2YoloTarget',
     'Gt2FCOSTarget',
     'Gt2TTFTarget',
+    'Gt2Solov2Target',
 ]
 
 
@@ -88,6 +90,13 @@ class PadBatch(BaseOperator):
                     (1, max_shape[1], max_shape[2]), dtype=np.float32)
                 padding_sem[:, :im_h, :im_w] = semantic
                 data['semantic'] = padding_sem
+            if 'gt_segm' in data.keys() and data['gt_segm'] is not None:
+                gt_segm = data['gt_segm']
+                padding_segm = np.zeros(
+                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
+                    dtype=np.uint8)
+                padding_segm[:, :im_h, :im_w] = gt_segm
+                data['gt_segm'] = padding_segm
         return samples
 
 
@@ -590,3 +599,154 @@ class Gt2TTFTarget(BaseOperator):
             heatmap[y - top:y + bottom, x - left:x + right] = np.maximum(
                 masked_heatmap, masked_gaussian)
         return heatmap
+
+
+@register_op
+class Gt2Solov2Target(BaseOperator):
+    """Assign mask targets and labels in SOLOv2 network.
+    Args:
+        num_grids (list): List of grid sizes, one per feature map level.
+        scale_ranges (list): List of mask scale ranges, one per level.
+        coord_sigma (float): Shrink factor applied to a box's extent when
+            assigning its center region to grid cells.
+        sampling_ratio (float): Down-sampling ratio of the mask targets.
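+
+    For each sample and FPN level i, this op emits 'cate_label{i}',
+    'ins_label{i}' and 'grid_order{i}', plus the global 'fg_num' and
+    'grid_offset' fields; these match the keys consumed by
+    SOLOv2Head.get_loss in this patch.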
+ """ + + def __init__(self, + num_grids=[40, 36, 24, 16, 12], + scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], + [384, 2048]], + coord_sigma=0.2, + sampling_ratio=4.0): + super(Gt2Solov2Target, self).__init__() + self.num_grids = num_grids + self.scale_ranges = scale_ranges + self.coord_sigma = coord_sigma + self.sampling_ratio = sampling_ratio + + def _scale_size(self, im, scale): + h, w = im.shape[:2] + new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) + resized_img = cv2.resize( + im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) + return resized_img + + def __call__(self, samples, context=None): + for sample in samples: + gt_bboxes_raw = sample['gt_bbox'] + gt_labels_raw = sample['gt_class'] + im_c, im_h, im_w = sample['image'].shape[:] + gt_masks_raw = sample['gt_segm'].astype(np.uint8) + mask_feat_size = [ + int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) + ] + gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * + (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) + ins_ind_label_list = [] + grid_offset = [] + idx = 0 + for (lower_bound, upper_bound), num_grid \ + in zip(self.scale_ranges, self.num_grids): + + hit_indices = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero()[0] + num_ins = len(hit_indices) + + ins_label = [] + grid_order = [] + cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) + ins_ind_label = np.zeros([num_grid**2], dtype=np.bool) + + if num_ins == 0: + ins_label = np.zeros( + [1, mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray([0]) + grid_offset.append(1) + idx += 1 + continue + gt_bboxes = gt_bboxes_raw[hit_indices] + gt_labels = gt_labels_raw[hit_indices] + gt_masks = gt_masks_raw[hit_indices, ...] + + half_ws = 0.5 * ( + gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma + half_hs = 0.5 * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma + + for seg_mask, gt_label, half_h, half_w in zip( + gt_masks, gt_labels, half_hs, half_ws): + if seg_mask.sum() == 0: + continue + # mass center + upsampled_size = (mask_feat_size[0] * 4, + mask_feat_size[1] * 4) + center_h, center_w = ndimage.measurements.center_of_mass( + seg_mask) + coord_w = int( + (center_w / upsampled_size[1]) // (1. / num_grid)) + coord_h = int( + (center_h / upsampled_size[0]) // (1. / num_grid)) + + # left, top, right, down + top_box = max(0, + int(((center_h - half_h) / upsampled_size[0]) + // (1. / num_grid))) + down_box = min(num_grid - 1, + int(((center_h + half_h) / upsampled_size[0]) + // (1. / num_grid))) + left_box = max(0, + int(((center_w - half_w) / upsampled_size[1]) + // (1. / num_grid))) + right_box = min(num_grid - 1, + int(((center_w + half_w) / + upsampled_size[1]) // (1. / num_grid))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + cate_label[top:(down + 1), left:(right + 1)] = gt_label + seg_mask = self._scale_size( + seg_mask, scale=1. 
/ self.sampling_ratio) + for i in range(top, down + 1): + for j in range(left, right + 1): + label = int(i * num_grid + j) + cur_ins_label = np.zeros( + [mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ + 1]] = seg_mask + ins_label.append(cur_ins_label) + ins_ind_label[label] = True + grid_order.append(label) + if ins_label == []: + ins_label = np.zeros( + [1, mask_feat_size[0], mask_feat_size[1]], + dtype=np.uint8) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray([0]) + grid_offset.append(1) + else: + ins_label = np.stack(ins_label, axis=0) + ins_ind_label_list.append(ins_ind_label) + sample['cate_label{}'.format(idx)] = cate_label.flatten() + sample['ins_label{}'.format(idx)] = ins_label + sample['grid_order{}'.format(idx)] = np.asarray(grid_order) + assert len(grid_order) > 0 + grid_offset.append(len(grid_order)) + idx += 1 + ins_ind_labels = np.concatenate([ + ins_ind_labels_level_img + for ins_ind_labels_level_img in ins_ind_label_list + ]) + fg_num = np.sum(ins_ind_labels) + sample['fg_num'] = fg_num + sample['grid_offset'] = np.asarray(grid_offset).astype(np.int32) + + return samples diff --git a/ppdet/data/transform/operators.py b/ppdet/data/transform/operators.py index b49111933fa7d4513dd0c91f3de61fd228b8115a..c32f9fb9f4d3b2c4dad09b3de2e33f7ff82ed017 100644 --- a/ppdet/data/transform/operators.py +++ b/ppdet/data/transform/operators.py @@ -272,7 +272,8 @@ class ResizeImage(BaseOperator): target_size=0, max_size=0, interp=cv2.INTER_LINEAR, - use_cv2=True): + use_cv2=True, + resize_box=False): """ Rescale image to the specified target size, and capped at max_size if max_size != 0. @@ -285,11 +286,13 @@ class ResizeImage(BaseOperator): interp (int): the interpolation method use_cv2 (bool): use the cv2 interpolation method or use PIL interpolation method + resize_box (bool): whether resize ground truth bbox annotations. """ super(ResizeImage, self).__init__() self.max_size = int(max_size) self.interp = int(interp) self.use_cv2 = use_cv2 + self.resize_box = resize_box if not (isinstance(target_size, int) or isinstance(target_size, list)): raise TypeError( "Type of target_size is invalid. Must be Integer or List, now is {}". 
@@ -348,18 +351,6 @@ class ResizeImage(BaseOperator): fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) - if 'semantic' in sample.keys() and sample['semantic'] is not None: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic else: if self.max_size != 0: raise TypeError( @@ -370,6 +361,38 @@ class ResizeImage(BaseOperator): im = im.resize((int(resize_w), int(resize_h)), self.interp) im = np.array(im) sample['image'] = im + sample['scale_factor'] = [im_scale_x, im_scale_y] * 2 + if 'gt_bbox' in sample and self.resize_box and len(sample[ + 'gt_bbox']) > 0: + bboxes = sample['gt_bbox'] * sample['scale_factor'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, resize_w - 1) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, resize_h - 1) + sample['gt_bbox'] = bboxes + if 'semantic' in sample.keys() and sample['semantic'] is not None: + semantic = sample['semantic'] + semantic = cv2.resize( + semantic.astype('float32'), + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=self.interp) + semantic = np.asarray(semantic).astype('int32') + semantic = np.expand_dims(semantic, 0) + sample['semantic'] = semantic + if 'gt_segm' in sample and len(sample['gt_segm']) > 0: + masks = [ + cv2.resize( + gt_segm, + None, + None, + fx=im_scale_x, + fy=im_scale_y, + interpolation=cv2.INTER_NEAREST) + for gt_segm in sample['gt_segm'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + return sample @@ -473,7 +496,6 @@ class RandomFlipImage(BaseOperator): if self.is_mask_flip and len(sample['gt_poly']) != 0: sample['gt_poly'] = self.flip_segms(sample['gt_poly'], height, width) - if 'gt_keypoint' in sample.keys(): sample['gt_keypoint'] = self.flip_keypoint( sample['gt_keypoint'], width) @@ -482,6 +504,9 @@ class RandomFlipImage(BaseOperator): 'semantic'] is not None: sample['semantic'] = sample['semantic'][:, ::-1] + if 'gt_segm' in sample.keys() and sample['gt_segm'] is not None: + sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] + sample['flipped'] = True sample['image'] = im sample = samples if batch_input else samples[0] @@ -2557,3 +2582,41 @@ class DebugVisibleImage(BaseOperator): save_path = os.path.join(self.output_dir, out_file_name) image.save(save_path, quality=95) return sample + + +@register_op +class Poly2Mask(BaseOperator): + """ + gt poly to mask annotations + """ + + def __init__(self): + super(Poly2Mask, self).__init__() + import pycocotools.mask as maskUtils + self.maskutils = maskUtils + + def _poly2mask(self, mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + rle = self.maskutils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = self.maskutils.decode(rle) + return mask + + def __call__(self, sample, context=None): + assert 'gt_poly' in sample + im_h = sample['h'] + im_w = sample['w'] + masks = [ + self._poly2mask(gt_poly, im_h, im_w) + for gt_poly in sample['gt_poly'] + ] + sample['gt_segm'] = np.asarray(masks).astype(np.uint8) + return sample diff --git a/ppdet/modeling/__init__.py b/ppdet/modeling/__init__.py index 
f9491d771626848a631804ac3bf663f2ee6ec82c..423a4802ea12b4743e6e61c03edf8b96070560d3 100644
--- a/ppdet/modeling/__init__.py
+++ b/ppdet/modeling/__init__.py
@@ -22,6 +22,7 @@ from . import roi_extractors
 from . import roi_heads
 from . import ops
 from . import target_assigners
+from . import mask_head
 
 from .anchor_heads import *
 from .architectures import *
@@ -30,3 +31,4 @@ from .roi_extractors import *
 from .roi_heads import *
 from .ops import *
 from .target_assigners import *
+from .mask_head import *
diff --git a/ppdet/modeling/anchor_heads/__init__.py b/ppdet/modeling/anchor_heads/__init__.py
index 80324aa84faca181edd50546de323d1bbe154369..547999cb3a64a8745424470a32ebe73eeea678eb 100644
--- a/ppdet/modeling/anchor_heads/__init__.py
+++ b/ppdet/modeling/anchor_heads/__init__.py
@@ -21,6 +21,7 @@ from . import fcos_head
 from . import corner_head
 from . import efficient_head
 from . import ttf_head
+from . import solov2_head
 
 from .rpn_head import *
 from .yolo_head import *
@@ -29,3 +30,4 @@ from .fcos_head import *
 from .corner_head import *
 from .efficient_head import *
 from .ttf_head import *
+from .solov2_head import *
diff --git a/ppdet/modeling/anchor_heads/solov2_head.py b/ppdet/modeling/anchor_heads/solov2_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4005f2c9e9f59664cd6e8456021754788835dc76
--- /dev/null
+++ b/ppdet/modeling/anchor_heads/solov2_head.py
@@ -0,0 +1,531 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from paddle import fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.regularizer import L2Decay
+
+from ppdet.modeling.ops import ConvNorm, DeformConvNorm, MaskMatrixNMS
+from ppdet.core.workspace import register
+
+from ppdet.utils.check import check_version
+
+from six.moves import zip
+import numpy as np
+
+__all__ = ['SOLOv2Head']
+
+
+@register
+class SOLOv2Head(object):
+    """
+    Head block for SOLOv2 network
+
+    Args:
+        num_classes (int): Number of output classes.
+        seg_feat_channels (int): Number of filters in the kernel and category
+            branch convolutions.
+        stacked_convs (int): Number of stacked convolutions in each branch.
+        num_grids (list[int]): List of grid sizes, one per feature map level.
+        kernel_out_channels (int): Number of output channels in kernel branch.
+        ins_loss_weight (float): Weight of instance loss.
+        focal_loss_gamma (float): Gamma parameter for focal loss.
+        focal_loss_alpha (float): Alpha parameter for focal loss.
+        dcn_v2_stages (list): Stages that use DCNv2 in the tower.
+        segm_strides (list[int]): List of segmentation strides, one per level.
+        score_threshold (float): Threshold of category score.
+        mask_threshold (float): Threshold used to binarize predicted masks.
+        update_threshold (float): Threshold applied to category scores a
+            second time, after Matrix NMS rescoring.
+        pre_nms_top_n (int): Number of instances kept per image before NMS.
+        post_nms_top_n (int): Number of instances kept per image after NMS.
+ mask_nms (object): MaskMatrixNMS instance. + """ + __inject__ = [] + __shared__ = ['num_classes'] + + def __init__(self, + num_classes=80, + seg_feat_channels=256, + stacked_convs=4, + num_grids=[40, 36, 24, 16, 12], + kernel_out_channels=256, + ins_loss_weight=3.0, + focal_loss_gamma=2.0, + focal_loss_alpha=0.25, + dcn_v2_stages=[], + segm_strides=[8, 8, 16, 32, 32], + score_threshold=0.1, + mask_threshold=0.5, + update_threshold=0.05, + pre_nms_top_n=500, + post_nms_top_n=100, + mask_nms=MaskMatrixNMS( + kernel='gaussian', sigma=2.0).__dict__): + check_version('2.0.0') + self.num_classes = num_classes + self.seg_num_grids = num_grids + self.cate_out_channels = self.num_classes - 1 + self.seg_feat_channels = seg_feat_channels + self.stacked_convs = stacked_convs + self.kernel_out_channels = kernel_out_channels + self.ins_loss_weight = ins_loss_weight + self.focal_loss_gamma = focal_loss_gamma + self.focal_loss_alpha = focal_loss_alpha + self.dcn_v2_stages = dcn_v2_stages + self.segm_strides = segm_strides + self.mask_nms = mask_nms + self.score_threshold = score_threshold + self.mask_threshold = mask_threshold + self.update_threshold = update_threshold + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.conv_type = [ConvNorm, DeformConvNorm] + if isinstance(mask_nms, dict): + self.mask_nms = MaskMatrixNMS(**mask_nms) + + def _conv_pred(self, conv_feat, num_filters, name, name_feat=None): + for i in range(self.stacked_convs): + if i in self.dcn_v2_stages: + conv_func = self.conv_type[1] + else: + conv_func = self.conv_type[0] + conv_feat = conv_func( + input=conv_feat, + num_filters=self.seg_feat_channels, + filter_size=3, + stride=1, + norm_type='gn', + norm_groups=32, + freeze_norm=False, + act='relu', + initializer=fluid.initializer.NormalInitializer(scale=0.01), + norm_name='{}.{}.gn'.format(name, i), + name='{}.{}'.format(name, i)) + if name_feat == 'bbox_head.solo_cate': + bias_init = float(-np.log((1 - 0.01) / 0.01)) + bias_attr = ParamAttr( + name="{}.bias".format(name_feat), + initializer=fluid.initializer.Constant(value=bias_init)) + else: + bias_attr = ParamAttr(name="{}.bias".format(name_feat)) + conv_feat = fluid.layers.conv2d( + input=conv_feat, + num_filters=num_filters, + filter_size=3, + stride=1, + padding=1, + param_attr=ParamAttr( + name="{}.weight".format(name_feat), + initializer=fluid.initializer.NormalInitializer(scale=0.01)), + bias_attr=bias_attr, + name=name + '_feat_') + return conv_feat + + def _points_nms(self, heat, kernel=2): + hmax = fluid.layers.pool2d( + input=heat, pool_size=kernel, pool_type='max', pool_padding=1) + keep = fluid.layers.cast((hmax[:, :, :-1, :-1] == heat), 'float32') + return heat * keep + + def dice_loss(self, input, target): + input = fluid.layers.reshape( + input, shape=(fluid.layers.shape(input)[0], -1)) + target = fluid.layers.reshape( + target, shape=(fluid.layers.shape(target)[0], -1)) + target = fluid.layers.cast(target, 'float32') + a = fluid.layers.reduce_sum(input * target, dim=1) + b = fluid.layers.reduce_sum(input * input, dim=1) + 0.001 + c = fluid.layers.reduce_sum(target * target, dim=1) + 0.001 + d = (2 * a) / (b + c) + return 1 - d + + def _split_feats(self, feats): + return (paddle.nn.functional.interpolate( + feats[0], + scale_factor=0.5, + align_corners=False, + align_mode=0, + mode='bilinear'), feats[1], feats[2], feats[3], + paddle.nn.functional.interpolate( + feats[4], + size=fluid.layers.shape(feats[3])[-2:], + mode='bilinear', + align_corners=False, + align_mode=0)) + + def 
get_outputs(self, input, is_eval=False, batch_size=1):
+        """
+        Get SOLOv2 head output
+
+        Args:
+            input (list): List of Variables, output of backbone or neck stages
+            is_eval (bool): whether in evaluation mode
+            batch_size (int): batch size
+        Returns:
+            cate_pred_list (list): Variables of each category branch layer
+            kernel_pred_list (list): Variables of each kernel branch layer
+        """
+        feats = self._split_feats(input)
+        cate_pred_list = []
+        kernel_pred_list = []
+        for idx in range(len(self.seg_num_grids)):
+            cate_pred, kernel_pred = self._get_output_single(
+                feats[idx], idx, is_eval=is_eval, batch_size=batch_size)
+            cate_pred_list.append(cate_pred)
+            kernel_pred_list.append(kernel_pred)
+
+        return cate_pred_list, kernel_pred_list
+
+    def _get_output_single(self, input, idx, is_eval=False, batch_size=1):
+        ins_kernel_feat = input
+        # CoordConv
+        x_range = paddle.linspace(
+            -1, 1, fluid.layers.shape(ins_kernel_feat)[-1], dtype='float32')
+        y_range = paddle.linspace(
+            -1, 1, fluid.layers.shape(ins_kernel_feat)[-2], dtype='float32')
+        y, x = paddle.tensor.meshgrid([y_range, x_range])
+        x = fluid.layers.unsqueeze(x, [0, 1])
+        y = fluid.layers.unsqueeze(y, [0, 1])
+        y = fluid.layers.expand(y, expand_times=[batch_size, 1, 1, 1])
+        x = fluid.layers.expand(x, expand_times=[batch_size, 1, 1, 1])
+        coord_feat = fluid.layers.concat([x, y], axis=1)
+        ins_kernel_feat = fluid.layers.concat(
+            [ins_kernel_feat, coord_feat], axis=1)
+
+        # kernel branch
+        kernel_feat = ins_kernel_feat
+        seg_num_grid = self.seg_num_grids[idx]
+        kernel_feat = paddle.nn.functional.interpolate(
+            kernel_feat,
+            size=[seg_num_grid, seg_num_grid],
+            mode='bilinear',
+            align_corners=False,
+            align_mode=0)
+        cate_feat = kernel_feat[:, :-2, :, :]
+
+        kernel_pred = self._conv_pred(
+            kernel_feat,
+            self.kernel_out_channels,
+            name='bbox_head.kernel_convs',
+            name_feat='bbox_head.solo_kernel')
+
+        # cate branch
+        cate_pred = self._conv_pred(
+            cate_feat,
+            self.cate_out_channels,
+            name='bbox_head.cate_convs',
+            name_feat='bbox_head.solo_cate')
+
+        if is_eval:
+            cate_pred = self._points_nms(
+                fluid.layers.sigmoid(cate_pred), kernel=2)
+            cate_pred = fluid.layers.transpose(cate_pred, [0, 2, 3, 1])
+        return cate_pred, kernel_pred
+
+    def get_loss(self,
+                 cate_preds,
+                 kernel_preds,
+                 ins_pred,
+                 ins_labels,
+                 cate_labels,
+                 grid_order_list,
+                 fg_num,
+                 grid_offset,
+                 batch_size=1):
+        """
+        Get loss of network of SOLOv2.
+
+        Args:
+            cate_preds (list): Variable list of category branch output.
+            kernel_preds (list): Variable list of kernel branch output.
+            ins_pred (list): Variable list of instance branch output.
+            ins_labels (list): List of instance labels per batch.
+            cate_labels (list): List of category labels per batch.
+            grid_order_list (list): List of positive grid indices per level.
+            fg_num (int): Number of positive samples in a mini-batch.
+            grid_offset (list): List of per-image offsets into each level's
+                grid order list.
+            batch_size: Batch size.
+        Returns:
+            loss_ins (Variable): The instance loss Variable of SOLOv2 network.
+            loss_cate (Variable): The category loss Variable of SOLOv2 network.
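+        Note:
+            For every level, the grid_order tensors of all images in the
+            batch are concatenated along axis 0; grid_offset carries the
+            per-image counts, so the loop below can slice each image's
+            positive grid indices back out.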
+        """
+        new_kernel_preds = []
+        grid_offset_list = fluid.layers.split(
+            grid_offset, num_or_sections=len(grid_order_list), dim=1)
+        pred_weight_list = []
+        for kernel_preds_level, grid_orders_level, grid_offset_level in zip(
+                kernel_preds, grid_order_list, grid_offset_list):
+            tmp_list = []
+            kernel_pred_weight = []
+            start_order_num = fluid.layers.zeros(shape=[1], dtype='int32')
+            for i in range(batch_size):
+                reshape_pred = fluid.layers.reshape(
+                    kernel_preds_level[i],
+                    shape=(int(kernel_preds_level[i].shape[0]), -1))
+                end_order_num = start_order_num + grid_offset_level[i]
+                grid_order_img = fluid.layers.slice(
+                    grid_orders_level,
+                    axes=[0],
+                    starts=[start_order_num],
+                    ends=[end_order_num])
+                start_order_num = end_order_num
+                reshape_pred = fluid.layers.transpose(reshape_pred, [1, 0])
+                reshape_pred = fluid.layers.gather(
+                    reshape_pred, index=grid_order_img)
+                reshape_pred = fluid.layers.transpose(reshape_pred, [1, 0])
+                tmp_list.append(reshape_pred)
+            new_kernel_preds.append(tmp_list)
+
+        # generate masks
+        ins_pred_list = []
+        for b_kernel_pred in new_kernel_preds:
+            b_mask_pred = []
+            for idx, kernel_pred in enumerate(b_kernel_pred):
+                cur_ins_pred = ins_pred[idx]
+                cur_ins_pred = fluid.layers.unsqueeze(cur_ins_pred, 0)
+                kernel_pred = fluid.layers.transpose(kernel_pred, [1, 0])
+                kernel_pred = fluid.layers.unsqueeze(kernel_pred, [2, 3])
+
+                ins_pred_conv = paddle.nn.functional.conv2d(cur_ins_pred,
+                                                            kernel_pred)
+                cur_ins_pred = ins_pred_conv[0]
+                b_mask_pred.append(cur_ins_pred)
+
+            b_mask_pred = fluid.layers.concat(b_mask_pred, axis=0)
+            ins_pred_list.append(b_mask_pred)
+
+        num_ins = fluid.layers.reduce_sum(fg_num)
+
+        # Use dice_loss to calculate instance loss
+        loss_ins = []
+        total_weights = fluid.layers.zeros(shape=[1], dtype='float32')
+        for input, target in zip(ins_pred_list, ins_labels):
+            weights = fluid.layers.cast(
+                fluid.layers.reduce_sum(
+                    target, dim=[1, 2]) > 0, 'float32')
+            input = fluid.layers.sigmoid(input)
+            dice_out = fluid.layers.elementwise_mul(
+                self.dice_loss(input, target), weights)
+            total_weights += fluid.layers.reduce_sum(weights)
+            loss_ins.append(dice_out)
+        loss_ins = fluid.layers.reduce_sum(fluid.layers.concat(
+            loss_ins)) / total_weights
+        loss_ins = loss_ins * self.ins_loss_weight
+
+        # Use sigmoid_focal_loss to calculate category loss
+        cate_preds = [
+            fluid.layers.reshape(
+                fluid.layers.transpose(cate_pred, [0, 2, 3, 1]),
+                shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds
+        ]
+        flatten_cate_preds = fluid.layers.concat(cate_preds)
+        new_cate_labels = []
+        cate_labels = fluid.layers.concat(cate_labels)
+        cate_labels = fluid.layers.unsqueeze(cate_labels, 1)
+        loss_cate = fluid.layers.sigmoid_focal_loss(
+            x=flatten_cate_preds,
+            label=cate_labels,
+            fg_num=num_ins + 1,
+            gamma=self.focal_loss_gamma,
+            alpha=self.focal_loss_alpha)
+        loss_cate = fluid.layers.reduce_sum(loss_cate)
+
+        return {'loss_ins': loss_ins, 'loss_cate': loss_cate}
+
+    def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_info):
+        """
+        Get prediction result of SOLOv2 network
+
+        Args:
+            cate_preds (list): List of Variables, output of category branch.
+            kernel_preds (list): List of Variables, output of kernel branch.
+            seg_pred (list): List of Variables, output of mask head stages.
+            im_info (Variable): [h, w, scale] for input images.
+        Returns:
+            seg_masks (Variable): The prediction segmentation.
+            cate_labels (Variable): The prediction category label of each segmentation.
+            cate_scores (Variable): The prediction score of each segmentation.
+ """ + num_levels = len(cate_preds) + featmap_size = fluid.layers.shape(seg_pred)[-2:] + seg_masks_list = [] + cate_labels_list = [] + cate_scores_list = [] + cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] + kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] + # Currently only supports batch size == 1 + for idx in range(1): + cate_pred_list = [ + fluid.layers.reshape( + cate_preds[i][idx], shape=(-1, self.cate_out_channels)) + for i in range(num_levels) + ] + seg_pred_list = seg_pred + kernel_pred_list = [ + fluid.layers.reshape( + fluid.layers.transpose(kernel_preds[i][idx], [1, 2, 0]), + shape=(-1, self.kernel_out_channels)) + for i in range(num_levels) + ] + cate_pred_list = fluid.layers.concat(cate_pred_list, axis=0) + kernel_pred_list = fluid.layers.concat(kernel_pred_list, axis=0) + + seg_masks, cate_labels, cate_scores = self.get_seg_single( + cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, + im_info[idx]) + return { + "segm": seg_masks, + 'cate_label': cate_labels, + 'cate_score': cate_scores + } + + def sort_score(self, scores, top_num): + self.case_scores = scores + + def fn_1(): + return fluid.layers.topk(self.case_scores, top_num) + + def fn_2(): + return fluid.layers.argsort(self.case_scores, descending=True) + + sort_inds = fluid.layers.case( + pred_fn_pairs=[(fluid.layers.shape(scores)[0] > top_num, fn_1)], + default=fn_2) + return sort_inds + + def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, + im_info): + + im_scale = im_info[2] + h = fluid.layers.cast(im_info[0], 'int32') + w = fluid.layers.cast(im_info[1], 'int32') + upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4) + + inds = fluid.layers.where(cate_preds > self.score_threshold) + cate_preds = fluid.layers.reshape(cate_preds, shape=[-1]) + # Prevent empty and increase fake data + ind_a = fluid.layers.cast(fluid.layers.shape(kernel_preds)[0], 'int64') + ind_b = fluid.layers.zeros(shape=[1], dtype='int64') + inds_end = fluid.layers.unsqueeze( + fluid.layers.concat([ind_a, ind_b]), 0) + inds = fluid.layers.concat([inds, inds_end]) + kernel_preds_end = fluid.layers.ones( + shape=[1, self.kernel_out_channels], dtype='float32') + kernel_preds = fluid.layers.concat([kernel_preds, kernel_preds_end]) + cate_preds = fluid.layers.concat( + [cate_preds, fluid.layers.zeros( + shape=[1], dtype='float32')]) + + # cate_labels & kernel_preds + cate_labels = inds[:, 1] + kernel_preds = fluid.layers.gather(kernel_preds, index=inds[:, 0]) + cate_score_idx = fluid.layers.elementwise_add(inds[:, 0] * 80, + cate_labels) + cate_scores = fluid.layers.gather(cate_preds, index=cate_score_idx) + + size_trans = np.power(self.seg_num_grids, 2) + strides = [] + for _ind in range(len(self.segm_strides)): + strides.append( + fluid.layers.fill_constant( + shape=[int(size_trans[_ind])], + dtype="int32", + value=self.segm_strides[_ind])) + strides = fluid.layers.concat(strides) + strides = fluid.layers.gather(strides, index=inds[:, 0]) + + # mask encoding. 
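+        # Each kept kernel vector becomes a 1x1 conv filter; a single conv2d
+        # over the unified mask features then yields one mask per candidate.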
+ kernel_preds = fluid.layers.unsqueeze(kernel_preds, [2, 3]) + seg_preds = paddle.nn.functional.conv2d(seg_preds, kernel_preds) + seg_preds = fluid.layers.sigmoid(fluid.layers.squeeze(seg_preds, [0])) + seg_masks = seg_preds > self.mask_threshold + seg_masks = fluid.layers.cast(seg_masks, 'float32') + sum_masks = fluid.layers.reduce_sum(seg_masks, dim=[1, 2]) + + keep = fluid.layers.where(sum_masks > strides) + keep = fluid.layers.squeeze(keep, axes=[1]) + # Prevent empty and increase fake data + keep_other = fluid.layers.concat([ + keep, fluid.layers.cast( + fluid.layers.shape(sum_masks)[0] - 1, 'int64') + ]) + keep_scores = fluid.layers.concat([ + keep, fluid.layers.cast(fluid.layers.shape(sum_masks)[0], 'int64') + ]) + cate_scores_end = fluid.layers.zeros(shape=[1], dtype='float32') + cate_scores = fluid.layers.concat([cate_scores, cate_scores_end]) + + seg_masks = fluid.layers.gather(seg_masks, index=keep_other) + seg_preds = fluid.layers.gather(seg_preds, index=keep_other) + sum_masks = fluid.layers.gather(sum_masks, index=keep_other) + cate_labels = fluid.layers.gather(cate_labels, index=keep_other) + cate_scores = fluid.layers.gather(cate_scores, index=keep_scores) + + # mask scoring. + seg_mul = fluid.layers.cast(seg_preds * seg_masks, 'float32') + seg_scores = fluid.layers.reduce_sum(seg_mul, dim=[1, 2]) / sum_masks + cate_scores *= seg_scores + + # sort and keep top nms_pre + sort_inds = self.sort_score(cate_scores, self.pre_nms_top_n) + + seg_masks = fluid.layers.gather(seg_masks, index=sort_inds[1]) + seg_preds = fluid.layers.gather(seg_preds, index=sort_inds[1]) + sum_masks = fluid.layers.gather(sum_masks, index=sort_inds[1]) + cate_scores = sort_inds[0] + cate_labels = fluid.layers.gather(cate_labels, index=sort_inds[1]) + + # Matrix NMS + cate_scores = self.mask_nms( + seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) + + keep = fluid.layers.where(cate_scores >= self.update_threshold) + keep = fluid.layers.squeeze(keep, axes=[1]) + # Prevent empty and increase fake data + keep = fluid.layers.concat([ + keep, fluid.layers.cast( + fluid.layers.shape(cate_scores)[0] - 1, 'int64') + ]) + + seg_preds = fluid.layers.gather(seg_preds, index=keep) + cate_scores = fluid.layers.gather(cate_scores, index=keep) + cate_labels = fluid.layers.gather(cate_labels, index=keep) + + # sort and keep top_k + sort_inds = self.sort_score(cate_scores, self.post_nms_top_n) + + seg_preds = fluid.layers.gather(seg_preds, index=sort_inds[1]) + cate_scores = sort_inds[0] + cate_labels = fluid.layers.gather(cate_labels, index=sort_inds[1]) + ori_shape = im_info[:2] / im_scale + 0.5 + ori_shape = fluid.layers.cast(ori_shape, 'int32') + seg_preds = paddle.nn.functional.interpolate( + fluid.layers.unsqueeze(seg_preds, 0), + size=upsampled_size_out, + mode='bilinear', + align_corners=False, + align_mode=0)[:, :, :h, :w] + seg_masks = fluid.layers.squeeze( + paddle.nn.functional.interpolate( + seg_preds, + size=ori_shape[:2], + mode='bilinear', + align_corners=False, + align_mode=0), + axes=[0]) + seg_masks = fluid.layers.cast(seg_masks > self.mask_threshold, 'int32') + return seg_masks, cate_labels, cate_scores diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py index a8a77bb611983e48baa2e40fe9f9a190cbe171d4..7693a2c1ede4368aef9974917744920c90f9f5b2 100644 --- a/ppdet/modeling/architectures/__init__.py +++ b/ppdet/modeling/architectures/__init__.py @@ -29,6 +29,7 @@ from . import fcos from . import cornernet_squeeze from . import ttfnet from . 
import htc +from . import solov2 from .faster_rcnn import * from .mask_rcnn import * @@ -45,3 +46,4 @@ from .fcos import * from .cornernet_squeeze import * from .ttfnet import * from .htc import * +from .solov2 import * diff --git a/ppdet/modeling/architectures/solov2.py b/ppdet/modeling/architectures/solov2.py new file mode 100644 index 0000000000000000000000000000000000000000..756a269c4caea164e3c16a554d204c54428becfe --- /dev/null +++ b/ppdet/modeling/architectures/solov2.py @@ -0,0 +1,187 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +from paddle import fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['SOLOv2'] + + +@register +class SOLOv2(object): + """ + SOLOv2 network, see https://arxiv.org/abs/2003.10152 + + Args: + backbone (object): an backbone instance + fpn (object): feature pyramid network instance + bbox_head (object): an `SOLOv2Head` instance + mask_head (object): an `SOLOv2MaskHead` instance + batch_size (int): batch size. + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'fpn', 'bbox_head', 'mask_head'] + + def __init__(self, + backbone, + fpn=None, + bbox_head='SOLOv2Head', + mask_head='SOLOv2MaskHead', + batch_size=1): + super(SOLOv2, self).__init__() + self.backbone = backbone + self.fpn = fpn + self.bbox_head = bbox_head + self.mask_head = mask_head + self.batch_size = batch_size + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + body_feats = self.backbone(im) + + if self.fpn is not None: + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + if isinstance(body_feats, OrderedDict): + body_feat_names = list(body_feats.keys()) + body_feats = [body_feats[name] for name in body_feat_names] + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats] + + if not mode == 'train': + self.batch_size = 1 + + mask_feat_pred = self.mask_head.get_output(body_feats, self.batch_size) + + if mode == 'train': + ins_labels = [] + cate_labels = [] + grid_orders = [] + fg_num = feed_vars['fg_num'] + grid_offset = feed_vars['grid_offset'] + + for i in range(5): + ins_label = 'ins_label{}'.format(i) + if ins_label in feed_vars: + ins_labels.append(feed_vars[ins_label]) + cate_label = 'cate_label{}'.format(i) + if cate_label in feed_vars: + cate_labels.append(feed_vars[cate_label]) + grid_order = 'grid_order{}'.format(i) + if grid_order in feed_vars: + grid_orders.append(feed_vars[grid_order]) + + cate_preds, kernel_preds = self.bbox_head.get_outputs( + body_feats, batch_size=self.batch_size) + + losses = 
self.bbox_head.get_loss( + cate_preds, kernel_preds, mask_feat_pred, ins_labels, + cate_labels, grid_orders, fg_num, grid_offset, self.batch_size) + total_loss = fluid.layers.sum(list(losses.values())) + losses.update({'loss': total_loss}) + return losses + else: + im_info = feed_vars['im_info'] + outs = self.bbox_head.get_outputs( + body_feats, is_eval=True, batch_size=self.batch_size) + seg_inputs = outs + (mask_feat_pred, im_info) + return self.bbox_head.get_prediction(*seg_inputs) + + def _inputs_def(self, image_shape, fields): + im_shape = [None] + image_shape + # yapf: disable + inputs_def = { + 'image': {'shape': im_shape, 'dtype': 'float32', 'lod_level': 0}, + 'im_info': {'shape': [None, 3], 'dtype': 'float32', 'lod_level': 0}, + 'im_id': {'shape': [None, 1], 'dtype': 'int64', 'lod_level': 0}, + 'im_shape': {'shape': [None, 3], 'dtype': 'float32', 'lod_level': 0}, + } + + if 'gt_segm' in fields: + targets_def = { + 'ins_label0': {'shape': [None, None, None], 'dtype': 'int32', 'lod_level': 1}, + 'ins_label1': {'shape': [None, None, None], 'dtype': 'int32', 'lod_level': 1}, + 'ins_label2': {'shape': [None, None, None], 'dtype': 'int32', 'lod_level': 1}, + 'ins_label3': {'shape': [None, None, None], 'dtype': 'int32', 'lod_level': 1}, + 'ins_label4': {'shape': [None, None, None], 'dtype': 'int32', 'lod_level': 1}, + 'cate_label0': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'cate_label1': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'cate_label2': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'cate_label3': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'cate_label4': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'grid_order0': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'grid_order1': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'grid_order2': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'grid_order3': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'grid_order4': {'shape': [None], 'dtype': 'int32', 'lod_level': 1}, + 'fg_num': {'shape': [None], 'dtype': 'int32', 'lod_level': 0}, + 'grid_offset': {'shape': [None, 5], 'dtype': 'int32', 'lod_level': 0}, + } + # yapf: enable + inputs_def.update(targets_def) + return inputs_def + + def build_inputs( + self, + image_shape=[3, None, None], + fields=['image', 'im_id', 'gt_segm'], # for train + use_dataloader=True, + iterable=False): + inputs_def = self._inputs_def(image_shape, fields) + if 'gt_segm' in fields: + fields.remove('gt_segm') + fields.extend(['fg_num', 'grid_offset']) + for i in range(5): + fields.extend([ + 'ins_label%d' % i, 'cate_label%d' % i, 'grid_order%d' % i + ]) + + feed_vars = OrderedDict([(key, fluid.data( + name=key, + shape=inputs_def[key]['shape'], + dtype=inputs_def[key]['dtype'], + lod_level=inputs_def[key]['lod_level'])) for key in fields]) + loader = fluid.io.DataLoader.from_generator( + feed_list=list(feed_vars.values()), + capacity=16, + use_double_buffer=True, + iterable=iterable) if use_dataloader else None + return feed_vars, loader + + def train(self, feed_vars): + return self.build(feed_vars, mode='train') + + def eval(self, feed_vars): + return self.build(feed_vars, mode='test') + + def test(self, feed_vars): + return self.build(feed_vars, mode='test') diff --git a/ppdet/modeling/mask_head/__init__.py b/ppdet/modeling/mask_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba2b3b30bd6c7fd8df2a403dc0f0811bac9e6637 --- /dev/null +++ b/ppdet/modeling/mask_head/__init__.py @@ -0,0 +1,19 @@ +# 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from . import solo_mask_head + +from .solo_mask_head import * diff --git a/ppdet/modeling/mask_head/solo_mask_head.py b/ppdet/modeling/mask_head/solo_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..58bcb5531e8b670ae174cc5a015fd6ecaa2cadc5 --- /dev/null +++ b/ppdet/modeling/mask_head/solo_mask_head.py @@ -0,0 +1,152 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import fluid + +from ppdet.core.workspace import register +from ppdet.modeling.ops import ConvNorm, DeformConvNorm + +__all__ = ['SOLOv2MaskHead'] + + +@register +class SOLOv2MaskHead(object): + """ + SOLOv2MaskHead + + Args: + out_channels (int): The channel number of output variable. + start_level (int): The position where the input starts. + end_level (int): The position where the input ends. + num_classes (int): Number of classes in SOLOv2MaskHead output. + use_dcn_in_tower: Whether to use dcn in tower or not. 
+ """ + __shared__ = ['num_classes'] + + def __init__(self, + out_channels=128, + start_level=0, + end_level=3, + num_classes=81, + use_dcn_in_tower=False): + super(SOLOv2MaskHead, self).__init__() + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.start_level = start_level + self.end_level = end_level + self.num_classes = num_classes + self.use_dcn_in_tower = use_dcn_in_tower + self.conv_type = [ConvNorm, DeformConvNorm] + + def _convs_levels(self, conv_feat, level, name=None): + conv_func = self.conv_type[0] + if self.use_dcn_in_tower: + conv_func = self.conv_type[1] + + if level == 0: + return conv_func( + input=conv_feat, + num_filters=self.out_channels, + filter_size=3, + stride=1, + norm_type='gn', + norm_groups=32, + freeze_norm=False, + act='relu', + initializer=fluid.initializer.NormalInitializer(scale=0.01), + norm_name=name + '.conv' + str(level) + '.gn', + name=name + '.conv' + str(level)) + + for j in range(level): + conv_feat = conv_func( + input=conv_feat, + num_filters=self.out_channels, + filter_size=3, + stride=1, + norm_type='gn', + norm_groups=32, + freeze_norm=False, + act='relu', + initializer=fluid.initializer.NormalInitializer(scale=0.01), + norm_name=name + '.conv' + str(j) + '.gn', + name=name + '.conv' + str(j)) + conv_feat = fluid.layers.resize_bilinear( + conv_feat, + scale=2, + name='upsample' + str(level) + str(j), + align_corners=False, + align_mode=0) + return conv_feat + + def _conv_pred(self, conv_feat): + conv_func = self.conv_type[0] + if self.use_dcn_in_tower: + conv_func = self.conv_type[1] + conv_feat = conv_func( + input=conv_feat, + num_filters=self.num_classes, + filter_size=1, + stride=1, + norm_type='gn', + norm_groups=32, + freeze_norm=False, + act='relu', + initializer=fluid.initializer.NormalInitializer(scale=0.01), + norm_name='mask_feat_head.conv_pred.0.gn', + name='mask_feat_head.conv_pred.0') + + return conv_feat + + def get_output(self, inputs, batch_size=1): + """ + Get SOLOv2MaskHead output. 
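+
+        Levels start_level..end_level are convolved and bilinearly upsampled
+        step by step to the resolution of the first input level; with the
+        shipped FPN config that is 1/4 of the input image. The last level
+        (i == 3) gets two extra CoordConv channels before its tower. The
+        per-level features are summed and a final 1x1 conv produces the
+        unified mask feature with num_classes channels.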
+ + Args: + inputs(list[Variable]): feature map from each necks with shape of [N, C, H, W] + batch_size (int): batch size + Returns: + ins_pred(Variable): Output of SOLOv2MaskHead head + """ + range_level = self.end_level - self.start_level + 1 + feature_add_all_level = self._convs_levels( + inputs[0], 0, name='mask_feat_head.convs_all_levels.0') + for i in range(1, range_level): + input_p = inputs[i] + if i == 3: + input_feat = input_p + x_range = paddle.linspace( + -1, 1, fluid.layers.shape(input_feat)[-1], dtype='float32') + y_range = paddle.linspace( + -1, 1, fluid.layers.shape(input_feat)[-2], dtype='float32') + y, x = paddle.tensor.meshgrid([y_range, x_range]) + x = fluid.layers.unsqueeze(x, [0, 1]) + y = fluid.layers.unsqueeze(y, [0, 1]) + y = fluid.layers.expand(y, expand_times=[batch_size, 1, 1, 1]) + x = fluid.layers.expand(x, expand_times=[batch_size, 1, 1, 1]) + coord_feat = fluid.layers.concat([x, y], axis=1) + input_p = fluid.layers.concat([input_p, coord_feat], axis=1) + feature_add_all_level = fluid.layers.elementwise_add( + feature_add_all_level, + self._convs_levels( + input_p, + i, + name='mask_feat_head.convs_all_levels.{}'.format(i))) + ins_pred = self._conv_pred(feature_add_all_level) + + return ins_pred diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py index a288e5de97321c1fb8f0455b2918a7be660c0be3..86be31829bc53d23e759922fd1d12b0b70b94697 100644 --- a/ppdet/modeling/ops.py +++ b/ppdet/modeling/ops.py @@ -17,6 +17,7 @@ from numbers import Integral import math import six +import paddle from paddle import fluid from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.initializer import NumpyArrayInitializer @@ -1263,27 +1264,27 @@ class LibraBBoxAssigner(object): rois = create_tmp_var( fluid.default_main_program(), - name=None, #'rois', + name=None, dtype='float32', shape=[-1, 4], ) bbox_inside_weights = create_tmp_var( fluid.default_main_program(), - name=None, #'bbox_inside_weights', + name=None, dtype='float32', shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], ) bbox_outside_weights = create_tmp_var( fluid.default_main_program(), - name=None, #'bbox_outside_weights', + name=None, dtype='float32', shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], ) bbox_targets = create_tmp_var( fluid.default_main_program(), - name=None, #'bbox_targets', + name=None, dtype='float32', shape=[-1, 8 if self.is_cls_agnostic else self.class_nums * 4], ) labels_int32 = create_tmp_var( fluid.default_main_program(), - name=None, #'labels_int32', + name=None, dtype='int32', shape=[-1, 1], ) @@ -1565,3 +1566,79 @@ class RetinaOutputDecoder(object): self.nms_top_k = pre_nms_top_n self.keep_top_k = detections_per_im self.nms_eta = nms_eta + + +@register +@serializable +class MaskMatrixNMS(object): + """ + Matrix NMS for multi-class masks. 
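+
+    For each candidate j, the score decay is
+    decay_j = min_i(f(iou_ij) / f(c_i)), taken over higher-scored candidates
+    i of the same class, where c_i is the largest IoU between i and any of
+    its own higher-scored same-class candidates; f(x) = exp(-sigma * x^2)
+    for the 'gaussian' kernel and f(x) = 1 - x for 'linear'.
+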
+    Args:
+        kernel (str): kernel type, 'linear' or 'gaussian'
+        sigma (float): standard deviation of the gaussian kernel
+    Input:
+        seg_masks (Variable): shape (n, h, w), binary segmentation masks
+        cate_labels (Variable): shape (n), category label of each mask
+        cate_scores (Variable): shape (n), mask scores sorted in descending order
+        sum_masks (Variable): shape (n), the pixel area of each mask
+    Returns:
+        Variable: cate_scores_update, updated scores of shape (n)
+    """
+
+    def __init__(self, kernel='gaussian', sigma=2.0):
+        super(MaskMatrixNMS, self).__init__()
+        self.kernel = kernel
+        self.sigma = sigma
+
+    def __call__(self, seg_masks, cate_labels, cate_scores, sum_masks=None):
+        n_samples = fluid.layers.shape(cate_labels)
+        seg_masks = fluid.layers.reshape(seg_masks, shape=(n_samples, -1))
+        # pairwise intersections via inner products of the flattened masks
+        inter_matrix = paddle.mm(seg_masks,
+                                 fluid.layers.transpose(seg_masks, [1, 0]))
+        # per-mask areas broadcast to an (n, n) matrix for the union term
+        sum_masks_x = fluid.layers.reshape(
+            fluid.layers.expand(
+                sum_masks, expand_times=[n_samples]),
+            shape=[n_samples, n_samples])
+        # pairwise IoU; the upper triangle keeps each pair (i < j) exactly once
+        iou_matrix = (inter_matrix / (sum_masks_x + fluid.layers.transpose(
+            sum_masks_x, [1, 0]) - inter_matrix))
+        iou_matrix = paddle.tensor.triu(iou_matrix, diagonal=1)
+        # label-specific matrix: 1 where two masks share the same category
+        cate_labels_x = fluid.layers.reshape(
+            fluid.layers.expand(
+                cate_labels, expand_times=[n_samples]),
+            shape=[n_samples, n_samples])
+        label_matrix = fluid.layers.cast(
+            (cate_labels_x == fluid.layers.transpose(cate_labels_x, [1, 0])),
+            'float32')
+        label_matrix = paddle.tensor.triu(label_matrix, diagonal=1)
+
+        # IoU compensation: largest IoU with any higher-scoring same-class mask
+        compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
+        compensate_iou = fluid.layers.reshape(
+            fluid.layers.expand(
+                compensate_iou, expand_times=[n_samples]),
+            shape=[n_samples, n_samples])
+        compensate_iou = fluid.layers.transpose(compensate_iou, [1, 0])
+
+        # IoU decay: overlaps between same-class masks drive the decay
+        decay_iou = iou_matrix * label_matrix
+
+        # matrix nms: decay each score by its overlaps, normalized by compensation
+        if self.kernel == 'gaussian':
+            decay_matrix = fluid.layers.exp(-1 * self.sigma * (decay_iou**2))
+            compensate_matrix = fluid.layers.exp(-1 * self.sigma *
+                                                 (compensate_iou**2))
+            decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
+                                           axis=0)
+        elif self.kernel == 'linear':
+            decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
+            decay_coefficient = paddle.min(decay_matrix, axis=0)
+        else:
+            raise NotImplementedError
+
+        # update the score.
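+        # Scores are scaled by their decay coefficients: a mask that heavily
+        # overlaps a higher-scoring mask of the same class gets a coefficient
+        # near 0, while a near-disjoint mask keeps a coefficient near 1.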
+        cate_scores_update = cate_scores * decay_coefficient
+        return cate_scores_update
diff --git a/ppdet/utils/coco_eval.py b/ppdet/utils/coco_eval.py
index b54be135c76a33dcb5471b16f0697a734e050259..02a580e9fc25a8e238fa4e7ebde9da79cab13dc2 100644
--- a/ppdet/utils/coco_eval.py
+++ b/ppdet/utils/coco_eval.py
@@ -164,6 +164,47 @@ def mask_eval(results,
     cocoapi_eval(outfile, 'segm', coco_gt=coco_gt)
 
 
+def segm_eval(results, anno_file, outfile, save_only=False):
+    assert 'segm' in results[0]
+    assert outfile.endswith('.json')
+    from pycocotools.coco import COCO
+    coco_gt = COCO(anno_file)
+    clsid2catid = {i: v for i, v in enumerate(coco_gt.getCatIds())}
+    segm_results = []
+    for t in results:
+        im_id = int(t['im_id'][0][0])
+        segs = t['segm']
+        for mask in segs:
+            catid = int(clsid2catid[mask[0]])
+            masks = mask[1]
+            mask_score = masks[1]
+            segm = masks[0]
+            segm['counts'] = segm['counts'].decode('utf8')
+            coco_res = {
+                'image_id': im_id,
+                'category_id': catid,
+                'segmentation': segm,
+                'score': mask_score
+            }
+            segm_results.append(coco_res)
+
+    if len(segm_results) == 0:
+        logger.warning("The number of valid masks detected is zero.\n \
+            Please check the model and the input data.")
+        return
+
+    with open(outfile, 'w') as f:
+        json.dump(segm_results, f)
+
+    if save_only:
+        logger.info('The mask result is saved to {} and the mAP is not '
+                    'evaluated.'.format(outfile))
+        return
+
+    map_stats = cocoapi_eval(outfile, 'segm', coco_gt=coco_gt)
+    return map_stats
+
+
 def cocoapi_eval(jsonfile,
                  style,
                  coco_gt=None,
@@ -374,6 +415,43 @@ def mask2out(results, clsid2catid, resolution, thresh_binarize=0.5):
     return segm_res
 
 
+def segm2out(results, clsid2catid, thresh_binarize=0.5):
+    import pycocotools.mask as mask_util
+    segm_res = []
+
+    # for each batch
+    for t in results:
+        segms = t['segm'][0]
+        clsid_labels = t['cate_label'][0]
+        clsid_scores = t['cate_score'][0]
+        im_id = int(t['im_id'][0][0])
+        im_shape = t['im_shape'][0][0]
+        # guard against empty outputs before reading segms.shape
+        if segms is None or segms.shape[0] == 0:
+            continue
+        lengths = segms.shape[0]
+        # for each sample
+        for i in range(lengths - 1):
+            im_h = int(im_shape[0])
+            im_w = int(im_shape[1])
+
+            clsid = int(clsid_labels[i])
+            catid = clsid2catid[clsid]
+            score = clsid_scores[i]
+            mask = segms[i]
+            segm = mask_util.encode(
+                np.array(
+                    mask[:, :, np.newaxis], order='F'))[0]
+            segm['counts'] = segm['counts'].decode('utf8')
+            coco_res = {
+                'image_id': im_id,
+                'category_id': catid,
+                'segmentation': segm,
+                'score': score
+            }
+            segm_res.append(coco_res)
+    return segm_res
+
+
 def expand_boxes(boxes, scale):
     """
     Expand an array of boxes by a given scale.
diff --git a/ppdet/utils/eval_utils.py b/ppdet/utils/eval_utils.py
index ef3d11da75edd531293266df140b7992738f71ca..d6769bbcf2056f2de4da1a3f067c14b4799a8ab4 100644
--- a/ppdet/utils/eval_utils.py
+++ b/ppdet/utils/eval_utils.py
@@ -94,6 +94,25 @@ def clean_res(result, keep_name_list):
     return clean_result
 
 
+def get_masks(result):
+    import pycocotools.mask as mask_util
+    if result is None:
+        return []
+    seg_pred = result['segm'][0].astype(np.uint8)
+    cate_label = result['cate_label'][0].astype(np.int)
+    cate_score = result['cate_score'][0].astype(np.float)
+    num_ins = seg_pred.shape[0]
+    masks = []
+    for idx in range(num_ins - 1):
+        cur_mask = seg_pred[idx, ...]
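+        # encode the binary mask into COCO RLE; pycocotools expects a
+        # Fortran-ordered uint8 array of shape (h, w, 1)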
+ rle = mask_util.encode( + np.array( + cur_mask[:, :, np.newaxis], order='F'))[0] + rst = (rle, cate_score[idx]) + masks.append([cate_label[idx], rst]) + return masks + + def eval_run(exe, compile_program, loader, @@ -163,11 +182,13 @@ def eval_run(exe, corner_post_process(res, post_config, cfg.num_classes) if 'TTFNet' in cfg.architecture: res['bbox'][1].append([len(res['bbox'][0])]) + if 'segm' in res: + res['segm'] = get_masks(res) results.append(res) if iter_id % 100 == 0: logger.info('Test iter {}'.format(iter_id)) iter_id += 1 - if len(res['bbox'][1]) == 0: + if 'bbox' not in res or len(res['bbox'][1]) == 0: has_bbox = False images_num += len(res['bbox'][1][0]) if has_bbox else 1 except (StopIteration, fluid.core.EOFException): @@ -198,7 +219,7 @@ def eval_results(results, """Evaluation for evaluation program results""" box_ap_stats = [] if metric == 'COCO': - from ppdet.utils.coco_eval import proposal_eval, bbox_eval, mask_eval + from ppdet.utils.coco_eval import proposal_eval, bbox_eval, mask_eval, segm_eval anno_file = dataset.get_anno() with_background = dataset.with_background if 'proposal' in results[0]: @@ -225,6 +246,14 @@ def eval_results(results, output = os.path.join(output_directory, 'mask.json') mask_eval( results, anno_file, output, resolution, save_only=save_only) + if 'segm' in results[0]: + output = 'segm.json' + if output_directory: + output = os.path.join(output_directory, output) + mask_ap_stats = segm_eval( + results, anno_file, output, save_only=save_only) + if len(box_ap_stats) == 0: + box_ap_stats = mask_ap_stats else: if 'accum_map' in results[-1]: res = np.mean(results[-1]['accum_map'][0]) diff --git a/tools/eval.py b/tools/eval.py index 84da9b49ade12ce42bed5c1d3e2ee7053c373cb1..bb3b41e209b778044d472cbf1d950b78595defa3 100644 --- a/tools/eval.py +++ b/tools/eval.py @@ -132,9 +132,6 @@ def main(): extra_keys) sub_eval_prog = sub_eval_prog.clone(True) - #if 'weights' in cfg: - # checkpoint.load_params(exe, sub_eval_prog, cfg.weights) - # load model exe.run(startup_prog) if 'weights' in cfg: @@ -146,7 +143,6 @@ def main(): results = eval_run(exe, compile_program, loader, keys, values, cls, cfg, sub_eval_prog, sub_keys, sub_values, resolution) - #print(cfg['EvalReader']['dataset'].__dict__) # evaluation # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' diff --git a/tools/export_model.py b/tools/export_model.py index 1cf6549c7aa8c526e17d997c766817c2a7d6260f..793ed85f40dca62b147ccf573443f4fe5528be69 100644 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -46,11 +46,13 @@ TRT_MIN_SUBGRAPH = { 'Face': 3, 'TTFNet': 3, 'FCOS': 3, + 'SOLOv2': 3, } RESIZE_SCALE_SET = { 'RCNN', 'RetinaNet', 'FCOS', + 'SOLOv2', }