From c65eb1b52a1fd7414ec452cf234418573335a4d3 Mon Sep 17 00:00:00 2001 From: CodesFarmer Date: Thu, 9 Apr 2020 12:06:31 +0800 Subject: [PATCH] fcos initialization, with github user as committer (#405) * fcos initialization * delete the codes not used * pre-commit the committed code * delete the unused function * modify the capacity in loader from 64 to 16 --- configs/fcos_r50_fpn_1x.yml | 181 +++++++++++ configs/fcos_r50_fpn_multiscale_2x.yml | 181 +++++++++++ ppdet/data/transform/batch_operators.py | 205 ++++++++++++- ppdet/modeling/anchor_heads/__init__.py | 2 + ppdet/modeling/anchor_heads/fcos_head.py | 375 +++++++++++++++++++++++ ppdet/modeling/architectures/__init__.py | 2 + ppdet/modeling/architectures/fcos.py | 184 +++++++++++ ppdet/modeling/backbones/fpn.py | 9 +- ppdet/modeling/losses/__init__.py | 2 + ppdet/modeling/losses/fcos_loss.py | 205 +++++++++++++ ppdet/modeling/losses/iou_loss.py | 5 - ppdet/modeling/ops.py | 10 +- 12 files changed, 1352 insertions(+), 9 deletions(-) create mode 100644 configs/fcos_r50_fpn_1x.yml create mode 100644 configs/fcos_r50_fpn_multiscale_2x.yml create mode 100644 ppdet/modeling/anchor_heads/fcos_head.py create mode 100644 ppdet/modeling/architectures/fcos.py create mode 100644 ppdet/modeling/losses/fcos_loss.py diff --git a/configs/fcos_r50_fpn_1x.yml b/configs/fcos_r50_fpn_1x.yml new file mode 100644 index 000000000..5ce9b7ce7 --- /dev/null +++ b/configs/fcos_r50_fpn_1x.yml @@ -0,0 +1,181 @@ +architecture: FCOS +max_iters: 90000 +use_gpu: true +snapshot_iter: 10000 +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/fcos_r50_fpn_1x/model_final +num_classes: 81 + +FCOS: + backbone: ResNet + fpn: FPN + fcos_head: FCOSHead + +ResNet: + norm_type: affine_channel + norm_decay: 0. 
+ depth: 50 + feature_maps: [3, 4, 5] + freeze_at: 2 + +FPN: + min_level: 3 + max_level: 7 + num_chan: 256 + use_c5: false + spatial_scale: [0.03125, 0.0625, 0.125] + has_extra_convs: true + +FCOSHead: + num_classes: 81 + fpn_stride: [8, 16, 32, 64, 128] + num_convs: 4 + norm_type: "gn" + fcos_loss: FCOSLoss + norm_reg_targets: True + centerness_on_reg: True + use_dcn_in_tower: False + nms: MultiClassNMS + +MultiClassNMS: + score_threshold: 0.025 + nms_top_k: 1000 + keep_top_k: 100 + nms_threshold: 0.6 + background_label: -1 + +FCOSLoss: + loss_alpha: 0.25 + loss_gamma: 2.0 + iou_loss_type: "giou" + reg_weights: 1.0 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +TrainReader: + inputs_def: + fields: ['image', 'gt_bbox', 'gt_class', 'gt_score', 'im_info'] + dataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + with_background: true + sample_transforms: + - !DecodeImage + to_rgb: true + - !RandomFlipImage + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: 800 + max_size: 1333 + interp: 1 + use_cv2: true + - !Permute + to_bgr: false + channel_first: true + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: false + - !Gt2FCOSTarget + object_sizes_boundary: [64, 128, 256, 512] + center_sampling_radius: 1.5 + downsample_ratios: [8, 16, 32, 64, 128] + norm_reg_targets: True + batch_size: 2 + shuffle: true + worker_num: 16 + use_process: false + +EvalReader: + inputs_def: + fields: ['image', 'im_id', 'im_shape', 'im_info'] + dataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + with_background: false + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: 800 + max_size: 1333 + interp: 1 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: true + batch_size: 8 + shuffle: false + worker_num: 2 + use_process: false + +TestReader: + inputs_def: + # set image_shape if needed + fields: ['image', 'im_id', 'im_shape', 'im_info'] + dataset: + !ImageFolder + anno_path: annotations/instances_val2017.json + with_background: false + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: true + batch_size: 1 + shuffle: false diff --git a/configs/fcos_r50_fpn_multiscale_2x.yml b/configs/fcos_r50_fpn_multiscale_2x.yml new file mode 100644 index 000000000..f9369e313 --- /dev/null +++ b/configs/fcos_r50_fpn_multiscale_2x.yml @@ -0,0 +1,181 @@ +architecture: FCOS +max_iters: 180000 +use_gpu: true +snapshot_iter: 20000 +log_smooth_window: 20 +log_iter: 20 +save_dir: output +pretrain_weights: 
https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_cos_pretrained.tar +metric: COCO +weights: output/fcos_r50_fpn_multiscale_2x/model_final +num_classes: 81 + +FCOS: + backbone: ResNet + fpn: FPN + fcos_head: FCOSHead + +ResNet: + norm_type: affine_channel + norm_decay: 0. + depth: 50 + feature_maps: [3, 4, 5] + freeze_at: 2 + +FPN: + min_level: 3 + max_level: 7 + num_chan: 256 + use_c5: false + spatial_scale: [0.03125, 0.0625, 0.125] + has_extra_convs: true + +FCOSHead: + num_classes: 81 + fpn_stride: [8, 16, 32, 64, 128] + num_convs: 4 + norm_type: "gn" + fcos_loss: FCOSLoss + norm_reg_targets: True + centerness_on_reg: True + use_dcn_in_tower: False + nms: MultiClassNMS + +MultiClassNMS: + score_threshold: 0.025 + nms_top_k: 1000 + keep_top_k: 100 + nms_threshold: 0.6 + background_label: -1 + +FCOSLoss: + loss_alpha: 0.25 + loss_gamma: 2.0 + iou_loss_type: "giou" + reg_weights: 1.0 + +LearningRate: + base_lr: 0.01 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [120000, 160000] + - !LinearWarmup + start_factor: 0.3333333333333333 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +TrainReader: + inputs_def: + fields: ['image', 'gt_bbox', 'gt_class', 'gt_score', 'im_info'] + dataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + with_background: true + sample_transforms: + - !DecodeImage + to_rgb: true + - !RandomFlipImage + prob: 0.5 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: [640, 672, 704, 736, 768, 800] + max_size: 1333 + interp: 1 + use_cv2: true + - !Permute + to_bgr: false + channel_first: true + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: false + - !Gt2FCOSTarget + object_sizes_boundary: [64, 128, 256, 512] + center_sampling_radius: 1.5 + downsample_ratios: [8, 16, 32, 64, 128] + norm_reg_targets: True + batch_size: 2 + shuffle: true + worker_num: 16 + use_process: false + +EvalReader: + inputs_def: + fields: ['image', 'im_id', 'im_shape', 'im_info'] + dataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + with_background: false + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: 800 + max_size: 1333 + interp: 1 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: true + batch_size: 8 + shuffle: false + worker_num: 2 + use_process: false + +TestReader: + inputs_def: + # set image_shape if needed + fields: ['image', 'im_id', 'im_shape', 'im_info'] + dataset: + !ImageFolder + anno_path: annotations/instances_val2017.json + with_background: false + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 1333 + target_size: 800 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 128 + use_padded_im_info: true + batch_size: 1 + shuffle: false diff --git a/ppdet/data/transform/batch_operators.py 
b/ppdet/data/transform/batch_operators.py
index ff1035f0f..525c20bf0 100644
--- a/ppdet/data/transform/batch_operators.py
+++ b/ppdet/data/transform/batch_operators.py
@@ -30,7 +30,10 @@ from .op_helper import jaccard_overlap
 
 logger = logging.getLogger(__name__)
 
-__all__ = ['PadBatch', 'RandomShape', 'PadMultiScaleTest', 'Gt2YoloTarget']
+__all__ = [
+    'PadBatch', 'RandomShape', 'PadMultiScaleTest', 'Gt2YoloTarget',
+    'Gt2FCOSTarget'
+]
 
 
 @register_op
@@ -245,3 +248,203 @@ class Gt2YoloTarget(BaseOperator):
                     target[best_n, 6 + cls, gj, gi] = 1.
             sample['target{}'.format(i)] = target
         return samples
+
+
+@register_op
+class Gt2FCOSTarget(BaseOperator):
+    """
+    Generate FCOS targets from ground-truth data
+    """
+
+    def __init__(self,
+                 object_sizes_boundary,
+                 center_sampling_radius,
+                 downsample_ratios,
+                 norm_reg_targets=False):
+        super(Gt2FCOSTarget, self).__init__()
+        self.center_sampling_radius = center_sampling_radius
+        self.downsample_ratios = downsample_ratios
+        self.INF = np.inf
+        self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF]
+        object_sizes_of_interest = []
+        for i in range(len(self.object_sizes_boundary) - 1):
+            object_sizes_of_interest.append([
+                self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1]
+            ])
+        self.object_sizes_of_interest = object_sizes_of_interest
+        self.norm_reg_targets = norm_reg_targets
+
+    def _compute_points(self, w, h):
+        """
+        compute the anchor points for each feature map
+        :param w: image width
+        :param h: image height
+        :return: points from all feature maps
+        """
+        locations = []
+        for stride in self.downsample_ratios:
+            shift_x = np.arange(0, w, stride).astype(np.float32)
+            shift_y = np.arange(0, h, stride).astype(np.float32)
+            shift_x, shift_y = np.meshgrid(shift_x, shift_y)
+            shift_x = shift_x.flatten()
+            shift_y = shift_y.flatten()
+            location = np.stack([shift_x, shift_y], axis=1) + stride // 2
+            locations.append(location)
+        num_points_each_level = [len(location) for location in locations]
+        locations = np.concatenate(locations, axis=0)
+        return locations, num_points_each_level
+
+    def _convert_xywh2xyxy(self, gt_bbox, w, h):
+        """
+        convert bounding boxes from xywh style to xyxy style
+        :param gt_bbox: bounding boxes normalized into [0, 1]
+        :param w: image width
+        :param h: image height
+        :return: bounding boxes in xyxy style
+        """
+        bboxes = gt_bbox.copy()
+        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w
+        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h
+        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+        return bboxes
+
+    def _check_inside_boxes_limited(self, gt_bbox, xs, ys,
+                                    num_points_each_level):
+        """
+        check whether each point falls within the clipped (center-sampled) boxes
+        :param gt_bbox: bounding boxes
+        :param xs: horizontal coordinates of points
+        :param ys: vertical coordinates of points
+        :return: a mask of whether each point is within its clipped gt box
+        """
+        bboxes = np.reshape(
+            gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]])
+        bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1])
+        ct_x = (bboxes[:, :, 0] + bboxes[:, :, 2]) / 2
+        ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2
+        beg = 0
+        clipped_box = bboxes.copy()
+        for lvl, stride in enumerate(self.downsample_ratios):
+            end = beg + num_points_each_level[lvl]
+            stride_exp = self.center_sampling_radius * stride
+            clipped_box[beg:end, :, 0] = np.maximum(
+                bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp)
+            clipped_box[beg:end, :, 1] = np.maximum(
+                bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp)
+            clipped_box[beg:end, :, 2] = np.minimum(
+                bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp)
+            clipped_box[beg:end, :, 3] = np.minimum(
+                bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp)
+            beg = end
+        l_res = xs - clipped_box[:, :, 0]
+        r_res = clipped_box[:, :, 2] - xs
+        t_res = ys - clipped_box[:, :, 1]
+        b_res = clipped_box[:, :, 3] - ys
+        clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)
+        inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0
+        return inside_gt_box
+
+    def __call__(self, samples, context=None):
+        assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \
+            "'object_sizes_of_interest' and 'downsample_ratios' should have the same length."
+
+        for sample in samples:
+            # im, gt_bbox, gt_class, gt_score = sample
+            im = sample['image']
+            im_info = sample['im_info']
+            bboxes = sample['gt_bbox']
+            gt_class = sample['gt_class']
+            gt_score = sample['gt_score']
+            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * np.floor(im_info[1]) / \
+                np.floor(im_info[1] / im_info[2])
+            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * np.floor(im_info[0]) / \
+                np.floor(im_info[0] / im_info[2])
+            # calculate the locations
+            h, w = sample['image'].shape[1:3]
+            points, num_points_each_level = self._compute_points(w, h)
+            object_scale_exp = []
+            for i, num_pts in enumerate(num_points_each_level):
+                object_scale_exp.append(
+                    np.tile(
+                        np.array([self.object_sizes_of_interest[i]]),
+                        reps=[num_pts, 1]))
+            object_scale_exp = np.concatenate(object_scale_exp, axis=0)
+
+            gt_area = (bboxes[:, 2] - bboxes[:, 0]) * (
+                bboxes[:, 3] - bboxes[:, 1])
+            xs, ys = points[:, 0], points[:, 1]
+            xs = np.reshape(xs, newshape=[xs.shape[0], 1])
+            xs = np.tile(xs, reps=[1, bboxes.shape[0]])
+            ys = np.reshape(ys, newshape=[ys.shape[0], 1])
+            ys = np.tile(ys, reps=[1, bboxes.shape[0]])
+
+            l_res = xs - bboxes[:, 0]
+            r_res = bboxes[:, 2] - xs
+            t_res = ys - bboxes[:, 1]
+            b_res = bboxes[:, 3] - ys
+            reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2)
+            if self.center_sampling_radius > 0:
+                is_inside_box = self._check_inside_boxes_limited(
+                    bboxes, xs, ys, num_points_each_level)
+            else:
+                is_inside_box = np.min(reg_targets, axis=2) > 0
+            # check whether the targets fit the size range of the corresponding level
+            max_reg_targets = np.max(reg_targets, axis=2)
+            lower_bound = np.tile(
+                np.expand_dims(
+                    object_scale_exp[:, 0], axis=1),
+                reps=[1, max_reg_targets.shape[1]])
+            high_bound = np.tile(
+                np.expand_dims(
+                    object_scale_exp[:, 1], axis=1),
+                reps=[1, max_reg_targets.shape[1]])
+            is_match_current_level = \
+                (max_reg_targets > lower_bound) & \
+                (max_reg_targets < high_bound)
+            points2gtarea = np.tile(
+                np.expand_dims(
+                    gt_area, axis=0), reps=[xs.shape[0], 1])
+            points2gtarea[is_inside_box == 0] = self.INF
+            points2gtarea[is_match_current_level == 0] = self.INF
+            points2min_area = points2gtarea.min(axis=1)
+            points2min_area_ind = points2gtarea.argmin(axis=1)
+            labels = gt_class[points2min_area_ind]
+            labels[points2min_area == self.INF] = 0
+            reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind]
+            ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \
+                                   reg_targets[:, [0, 2]].max(axis=1)) * \
+                                  (reg_targets[:, [1, 3]].min(axis=1) / \
+                                   reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32)
+            ctn_targets = np.reshape(
+                ctn_targets, newshape=[ctn_targets.shape[0], 1])
+            ctn_targets[labels <= 0] = 0
+            pos_ind = np.nonzero(labels != 0)
+            reg_targets_pos = reg_targets[pos_ind[0], :]
+            split_sections = []
+            beg = 0
+            for lvl in range(len(num_points_each_level)):
+                end = beg + num_points_each_level[lvl]
+                split_sections.append(end)
+                beg = end
+            labels_by_level = np.split(labels, split_sections, axis=0)
+            reg_targets_by_level = np.split(reg_targets, split_sections, axis=0)
+            ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0)
+            for lvl in range(len(self.downsample_ratios)):
+                grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
+                grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
+                if self.norm_reg_targets:
+                    sample['reg_target{}'.format(lvl)] = \
+                        np.reshape(
+                            reg_targets_by_level[lvl] / \
+                            self.downsample_ratios[lvl],
+                            newshape=[grid_h, grid_w, 4])
+                else:
+                    sample['reg_target{}'.format(lvl)] = np.reshape(
+                        reg_targets_by_level[lvl],
+                        newshape=[grid_h, grid_w, 4])
+                sample['labels{}'.format(lvl)] = np.reshape(
+                    labels_by_level[lvl], newshape=[grid_h, grid_w, 1])
+                sample['centerness{}'.format(lvl)] = np.reshape(
+                    ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])
+        return samples
diff --git a/ppdet/modeling/anchor_heads/__init__.py b/ppdet/modeling/anchor_heads/__init__.py
index 1ed22160d..c6e495598 100644
--- a/ppdet/modeling/anchor_heads/__init__.py
+++ b/ppdet/modeling/anchor_heads/__init__.py
@@ -17,7 +17,9 @@ from __future__ import absolute_import
 from . import rpn_head
 from . import yolo_head
 from . import retina_head
+from . import fcos_head
 
 from .rpn_head import *
 from .yolo_head import *
 from .retina_head import *
+from .fcos_head import *
diff --git a/ppdet/modeling/anchor_heads/fcos_head.py b/ppdet/modeling/anchor_heads/fcos_head.py
new file mode 100644
index 000000000..de0b76410
--- /dev/null
+++ b/ppdet/modeling/anchor_heads/fcos_head.py
@@ -0,0 +1,375 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
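
For intuition, the encoding produced by Gt2FCOSTarget above, and consumed by the head below, maps each positive point to (l, t, r, b) distances plus a centerness score sqrt((min(l,r)/max(l,r)) * (min(t,b)/max(t,b))). A minimal NumPy sketch of that geometry for one box and one point (values are illustrative, not part of the patch):

    import numpy as np

    # One ground-truth box in xyxy form and one anchor point.
    box = np.array([40., 40., 200., 120.])
    px, py = 100., 80.

    # (l, t, r, b): distances from the point to the four box borders.
    l, t = px - box[0], py - box[1]
    r, b = box[2] - px, box[3] - py
    reg_target = np.array([l, t, r, b])  # all > 0 means the point is inside

    # Centerness downweights points far from the box center.
    centerness = np.sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
    print(reg_target, centerness)  # [ 60.  40. 100.  40.] ~0.775

Points whose max(l, t, r, b) falls outside a level's object_sizes_of_interest window are assigned to a different FPN level, which is what the lower_bound/high_bound check above implements.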
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+from paddle.fluid.regularizer import L2Decay
+from ppdet.modeling.ops import ConvNorm
+from ppdet.modeling.ops import MultiClassNMS
+
+from ppdet.core.workspace import register
+
+__all__ = ['FCOSHead']
+
+
+@register
+class FCOSHead(object):
+    """
+    FCOSHead
+    Args:
+        num_classes (int): number of classes
+        fpn_stride (list): stride of each FPN layer
+        prior_prob (float): used to set the bias init of the class prediction layer
+        num_convs (int): number of conv layers in the fcos head towers
+        norm_type (str): normalization type, 'bn'/'sync_bn'/'affine_channel'
+        fcos_loss (object): instance of 'FCOSLoss'
+        norm_reg_targets (bool): normalize the regression targets if true
+        centerness_on_reg (bool): predict centerness on the regression branch if true, otherwise on the classification branch
+        use_dcn_in_tower (bool): use deformable conv in the head towers if true
+        nms (object): instance of 'MultiClassNMS'
+    """
+    __inject__ = ['fcos_loss', 'nms']
+    __shared__ = ['num_classes']
+
+    def __init__(self,
+                 num_classes=81,
+                 fpn_stride=[8, 16, 32, 64, 128],
+                 prior_prob=0.01,
+                 num_convs=4,
+                 norm_type="gn",
+                 fcos_loss=None,
+                 norm_reg_targets=False,
+                 centerness_on_reg=False,
+                 use_dcn_in_tower=False,
+                 nms=MultiClassNMS(
+                     score_threshold=0.01,
+                     nms_top_k=1000,
+                     keep_top_k=100,
+                     nms_threshold=0.45,
+                     background_label=-1).__dict__):
+        self.num_classes = num_classes - 1
+        self.fpn_stride = fpn_stride[::-1]
+        self.prior_prob = prior_prob
+        self.num_convs = num_convs
+        self.norm_reg_targets = norm_reg_targets
+        self.centerness_on_reg = centerness_on_reg
+        self.use_dcn_in_tower = use_dcn_in_tower
+        self.norm_type = norm_type
+        self.fcos_loss = fcos_loss
+        self.batch_size = 8
+        self.nms = nms
+        if isinstance(nms, dict):
+            self.nms = MultiClassNMS(**nms)
+
+    def _fcos_head(self, features, fpn_stride, fpn_scale, is_training=False):
+        """
+        Args:
+            features (Variables): feature map from FPN
+            fpn_stride (int): stride of the current feature map
+            is_training (bool): whether in training mode
+        """
+        subnet_blob_cls = features
+        subnet_blob_reg = features
+        in_channels = features.shape[1]
+        for lvl in range(0, self.num_convs):
+            conv_cls_name = 'fcos_head_cls_tower_conv_{}'.format(lvl)
+            subnet_blob_cls = ConvNorm(
+                input=subnet_blob_cls,
+                num_filters=in_channels,
+                filter_size=3,
+                stride=1,
+                norm_type=self.norm_type,
+                act='relu',
+                initializer=Normal(
+                    loc=0., scale=0.01),
+                bias_attr=True,
+                norm_name=conv_cls_name + "_norm",
+                name=conv_cls_name)
+            conv_reg_name = 'fcos_head_reg_tower_conv_{}'.format(lvl)
+            subnet_blob_reg = ConvNorm(
+                input=subnet_blob_reg,
+                num_filters=in_channels,
+                filter_size=3,
+                stride=1,
+                norm_type=self.norm_type,
+                act='relu',
+                initializer=Normal(
+                    loc=0., scale=0.01),
+                bias_attr=True,
+                norm_name=conv_reg_name + "_norm",
+                name=conv_reg_name)
+        conv_cls_name = "fcos_head_cls"
+        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
+        cls_logits = fluid.layers.conv2d(
+            input=subnet_blob_cls,
+            num_filters=self.num_classes,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            param_attr=ParamAttr(
+                name=conv_cls_name + "_weights",
+                initializer=Normal(
+                    loc=0., scale=0.01)),
+            bias_attr=ParamAttr(
+                name=conv_cls_name + "_bias",
+                initializer=Constant(value=bias_init_value)),
+            name=conv_cls_name)
+        conv_reg_name = "fcos_head_reg"
+        bbox_reg = fluid.layers.conv2d(
+            input=subnet_blob_reg,
+            num_filters=4,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            param_attr=ParamAttr(
+                name=conv_reg_name + "_weights",
+                initializer=Normal(
+                    loc=0., scale=0.01)),
+            bias_attr=ParamAttr(
+                name=conv_reg_name + "_bias", initializer=Constant(value=0)),
+            name=conv_reg_name)
+        bbox_reg = bbox_reg * fpn_scale
+        if self.norm_reg_targets:
+            bbox_reg = fluid.layers.relu(bbox_reg)
+            if not is_training:
+                bbox_reg = bbox_reg * fpn_stride
+        else:
+            bbox_reg = fluid.layers.exp(bbox_reg)
+
+        conv_centerness_name = "fcos_head_centerness"
+        if self.centerness_on_reg:
+            subnet_blob_ctn = subnet_blob_reg
+        else:
+            subnet_blob_ctn = subnet_blob_cls
+        centerness = fluid.layers.conv2d(
+            input=subnet_blob_ctn,
+            num_filters=1,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            param_attr=ParamAttr(
+                name=conv_centerness_name + "_weights",
+                initializer=Normal(
+                    loc=0., scale=0.01)),
+            bias_attr=ParamAttr(
+                name=conv_centerness_name + "_bias",
+                initializer=Constant(value=0)),
+            name=conv_centerness_name)
+        return cls_logits, bbox_reg, centerness
+
+    def _get_output(self, body_feats, is_training=False):
+        """
+        Args:
+            body_feats (list): list of FPN feature maps
+            is_training (bool): whether in training mode
+        Return:
+            cls_logits (Variables): prediction for classification
+            bboxes_reg (Variables): prediction for bounding boxes
+            centerness (Variables): prediction for centerness
+        """
+        cls_logits = []
+        bboxes_reg = []
+        centerness = []
+        assert len(body_feats) == len(self.fpn_stride), \
+            "The size of body_feats is not equal to the size of fpn_stride"
+        for fpn_name, fpn_stride in zip(body_feats, self.fpn_stride):
+            features = body_feats[fpn_name]
+            scale = fluid.layers.create_parameter(
+                shape=[1, ],
+                dtype="float32",
+                name="%s_scale_on_reg" % fpn_name,
+                default_initializer=fluid.initializer.Constant(1.))
+            cls_pred, bbox_pred, ctn_pred = self._fcos_head(
+                features, fpn_stride, scale, is_training=is_training)
+            cls_logits.append(cls_pred)
+            bboxes_reg.append(bbox_pred)
+            centerness.append(ctn_pred)
+        return cls_logits, bboxes_reg, centerness
+
+    def _compute_locations(self, features):
+        """
+        Args:
+            features (list): list of Variables for FPN feature maps
+        Return:
+            anchor points for each feature map pixel
+        """
+        locations = []
+        for lvl, fpn_name in enumerate(features):
+            feature = features[fpn_name]
+            shape_fm = fluid.layers.shape(feature)
+            shape_fm.stop_gradient = True
+            h = shape_fm[2]
+            w = shape_fm[3]
+            fpn_stride = self.fpn_stride[lvl]
+            shift_x = fluid.layers.range(
+                0, w * fpn_stride, fpn_stride, dtype='float32')
+            shift_y = fluid.layers.range(
+                0, h * fpn_stride, fpn_stride, dtype='float32')
+            shift_x = fluid.layers.unsqueeze(shift_x, axes=[0])
+            shift_y = fluid.layers.unsqueeze(shift_y, axes=[1])
+            shift_x = fluid.layers.expand_as(
+                shift_x, target_tensor=feature[0, 0, :, :])
+            shift_y = fluid.layers.expand_as(
+                shift_y, target_tensor=feature[0, 0, :, :])
+            shift_x.stop_gradient = True
+            shift_y.stop_gradient = True
+            shift_x = fluid.layers.reshape(shift_x, shape=[-1])
+            shift_y = fluid.layers.reshape(shift_y, shape=[-1])
+            location = fluid.layers.stack(
+                [shift_x, shift_y], axis=-1) + fpn_stride // 2
+            location.stop_gradient = True
+            locations.append(location)
+        return locations
+
+    def __merge_hw(self, input, ch_type="channel_first"):
+        """
+        Args:
+            input (Variables): feature map whose H and W will be merged into one dimension
+            ch_type (str): channel_first / channel_last
+        Return:
+            new_shape (Variables): the new shape after H and W are merged into one dimension
+        """
+        shape_ = fluid.layers.shape(input)
+        bs = shape_[0]
+        ch = shape_[1]
+        hi = shape_[2]
+        wi = shape_[3]
+        img_size = hi * wi
+        img_size.stop_gradient = True
+        if ch_type == "channel_first":
+            new_shape = fluid.layers.concat([bs, ch, img_size])
+        elif ch_type == "channel_last":
+            new_shape = fluid.layers.concat([bs, img_size, ch])
+        else:
+            raise KeyError("Wrong ch_type %s" % ch_type)
+        new_shape.stop_gradient = True
+        return new_shape
+
+    def _postprocessing_by_level(self, locations, box_cls, box_reg, box_ctn,
+                                 im_info):
+        """
+        Args:
+            locations (Variables): anchor points for the current layer
+            box_cls (Variables): category prediction
+            box_reg (Variables): bounding box prediction
+            box_ctn (Variables): centerness prediction
+            im_info (Variables): [h, w, scale] for input images
+        Return:
+            box_cls_ch_last (Variables): score for each category, in [N, C, M],
+                where C is the number of classes and M is the number of anchor points
+            box_reg_decoding (Variables): decoded bounding boxes, in [N, M, 4],
+                last dimension is [x1, y1, x2, y2]
+        """
+        act_shape_cls = self.__merge_hw(box_cls)
+        box_cls_ch_last = fluid.layers.reshape(
+            x=box_cls,
+            shape=[self.batch_size, self.num_classes, -1],
+            actual_shape=act_shape_cls)
+        box_cls_ch_last = fluid.layers.sigmoid(box_cls_ch_last)
+        act_shape_reg = self.__merge_hw(box_reg, "channel_last")
+        box_reg_ch_last = fluid.layers.transpose(box_reg, perm=[0, 2, 3, 1])
+        box_reg_ch_last = fluid.layers.reshape(
+            x=box_reg_ch_last,
+            shape=[self.batch_size, -1, 4],
+            actual_shape=act_shape_reg)
+        act_shape_ctn = self.__merge_hw(box_ctn)
+        box_ctn_ch_last = fluid.layers.reshape(
+            x=box_ctn,
+            shape=[self.batch_size, 1, -1],
+            actual_shape=act_shape_ctn)
+        box_ctn_ch_last = fluid.layers.sigmoid(box_ctn_ch_last)
+
+        box_reg_decoding = fluid.layers.stack(
+            [
+                locations[:, 0] - box_reg_ch_last[:, :, 0],
+                locations[:, 1] - box_reg_ch_last[:, :, 1],
+                locations[:, 0] + box_reg_ch_last[:, :, 2],
+                locations[:, 1] + box_reg_ch_last[:, :, 3]
+            ],
+            axis=1)
+        box_reg_decoding = fluid.layers.transpose(
+            box_reg_decoding, perm=[0, 2, 1])
+        # recover the locations to the original image scale
+        im_scale = im_info[:, 2]
+        box_reg_decoding = box_reg_decoding / im_scale
+        box_cls_ch_last = box_cls_ch_last * box_ctn_ch_last
+        return box_cls_ch_last, box_reg_decoding
+
+    def _post_processing(self, locations, cls_logits, bboxes_reg, centerness,
+                         im_info):
+        """
+        Args:
+            locations (list): list of Variables holding the center of each anchor point
+            cls_logits (list): list of Variables for class prediction
+            bboxes_reg (list): list of Variables for bounding box prediction
+            centerness (list): list of Variables for centerness prediction
+            im_info (Variables): [h, w, scale] for input images
+        Return:
+            pred (LoDTensor): predicted bounding boxes after nms,
+                with shape n x 6; the last dimension is [label, score, xmin, ymin, xmax, ymax]
+        """
+        pred_boxes_ = []
+        pred_scores_ = []
+        for _, (
+                pts, cls, box, ctn
+        ) in enumerate(zip(locations, cls_logits, bboxes_reg, centerness)):
+            pred_scores_lvl, pred_boxes_lvl = self._postprocessing_by_level(
+                pts, cls, box, ctn, im_info)
+            pred_boxes_.append(pred_boxes_lvl)
+            pred_scores_.append(pred_scores_lvl)
+        pred_boxes = fluid.layers.concat(pred_boxes_, axis=1)
+        pred_scores = fluid.layers.concat(pred_scores_, axis=2)
+        pred = self.nms(pred_boxes, pred_scores)
+        return pred
+
+    def get_loss(self, input, tag_labels, tag_bboxes, tag_centerness):
+        """
+        Calculate the loss for FCOS
+
Args: + input (list): List of Variables for feature maps from FPN layers + tag_labels (Variables): category targets for each anchor point + tag_bboxes (Variables): bounding boxes targets for positive samples + tag_centerness (Variables): centerness targets for positive samples + Return: + loss (dict): loss composed by classification loss, bounding box + regression loss and centerness regression loss + """ + cls_logits, bboxes_reg, centerness = self._get_output( + input, is_training=True) + loss = self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, + tag_bboxes, tag_centerness) + return loss + + def get_prediction(self, input, im_info): + """ + Decode the prediction + Args: + input (list): List of Variables for feature maps from FPN layers + im_info(Variables): [h, w, scale] for input images + Return: + the bounding box prediction + """ + cls_logits, bboxes_reg, centerness = self._get_output( + input, is_training=False) + locations = self._compute_locations(input) + pred = self._post_processing(locations, cls_logits, bboxes_reg, + centerness, im_info) + return {"bbox": pred} diff --git a/ppdet/modeling/architectures/__init__.py b/ppdet/modeling/architectures/__init__.py index 7acd147ea..652a38312 100644 --- a/ppdet/modeling/architectures/__init__.py +++ b/ppdet/modeling/architectures/__init__.py @@ -24,6 +24,7 @@ from . import ssd from . import retinanet from . import blazeface from . import faceboxes +from . import fcos from .faster_rcnn import * from .mask_rcnn import * @@ -35,3 +36,4 @@ from .ssd import * from .retinanet import * from .blazeface import * from .faceboxes import * +from .fcos import * diff --git a/ppdet/modeling/architectures/fcos.py b/ppdet/modeling/architectures/fcos.py new file mode 100644 index 000000000..3d5709cfe --- /dev/null +++ b/ppdet/modeling/architectures/fcos.py @@ -0,0 +1,184 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
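
The decode in _postprocessing_by_level above recovers box corners as (x - l, y - t, x + r, y + b) around each anchor point and divides by the im_info scale to return to original-image coordinates. A small NumPy sketch of the same arithmetic (single level, hypothetical values, not part of the patch):

    import numpy as np

    locations = np.array([[4., 4.], [12., 4.]])       # anchor points [M, 2]
    offsets = np.array([[3., 2., 5., 6.],             # predicted l, t, r, b
                        [4., 4., 4., 4.]])            # per point [M, 4]

    x, y = locations[:, 0], locations[:, 1]
    l, t, r, b = offsets.T
    boxes = np.stack([x - l, y - t, x + r, y + b], axis=1)  # xyxy per point

    im_scale = 0.5               # im_info[2], the resize scale of the input
    print(boxes / im_scale)      # boxes mapped back to the original image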
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict + +import paddle.fluid as fluid + +from ppdet.experimental import mixed_precision_global_state +from ppdet.core.workspace import register + +__all__ = ['FCOS'] + + +@register +class FCOS(object): + """ + FCOS architecture, see https://arxiv.org/abs/1904.01355 + + Args: + backbone (object): backbone instance + fpn (object): feature pyramid network instance + fcos_head (object): `FCOSHead` instance + """ + + __category__ = 'architecture' + __inject__ = ['backbone', 'fpn', 'fcos_head'] + + def __init__(self, backbone, fpn, fcos_head): + super(FCOS, self).__init__() + self.backbone = backbone + self.fpn = fpn + self.fcos_head = fcos_head + + def build(self, feed_vars, mode='train'): + im = feed_vars['image'] + im_info = feed_vars['im_info'] + + mixed_precision_enabled = mixed_precision_global_state() is not None + # cast inputs to FP16 + if mixed_precision_enabled: + im = fluid.layers.cast(im, 'float16') + + # backbone + body_feats = self.backbone(im) + + # cast features back to FP32 + if mixed_precision_enabled: + body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32')) + for k, v in body_feats.items()) + + # FPN + body_feats, spatial_scale = self.fpn.get_output(body_feats) + + # fcosnet head + if mode == 'train': + tag_labels = [] + tag_bboxes = [] + tag_centerness = [] + for i in range(len(self.fcos_head.fpn_stride)): + # reg_target, labels, scores, centerness + k_lbl = 'labels{}'.format(i) + if k_lbl in feed_vars: + tag_labels.append(feed_vars[k_lbl]) + k_box = 'reg_target{}'.format(i) + if k_box in feed_vars: + tag_bboxes.append(feed_vars[k_box]) + k_ctn = 'centerness{}'.format(i) + if k_ctn in feed_vars: + tag_centerness.append(feed_vars[k_ctn]) + # tag_labels, tag_bboxes, tag_centerness + loss = self.fcos_head.get_loss(body_feats, tag_labels, tag_bboxes, + tag_centerness) + total_loss = fluid.layers.sum(list(loss.values())) + loss.update({'loss': total_loss}) + return loss + else: + pred = self.fcos_head.get_prediction(body_feats, im_info) + return pred + + def _inputs_def(self, image_shape, fields): + im_shape = [None] + image_shape + # yapf: disable + inputs_def = { + 'image': {'shape': im_shape, 'dtype': 'float32', 'lod_level': 0}, + 'im_shape': {'shape': [None, 3], 'dtype': 'float32', 'lod_level': 0}, + 'im_info': {'shape': [None, 3], 'dtype': 'float32', 'lod_level': 0}, + 'im_id': {'shape': [None, 1], 'dtype': 'int64', 'lod_level': 0}, + 'gt_bbox': {'shape': [None, 4], 'dtype': 'float32', 'lod_level': 1}, + 'gt_class': {'shape': [None, 1], 'dtype': 'int32', 'lod_level': 1}, + 'gt_score': {'shape': [None, 1], 'dtype': 'float32', 'lod_level': 1}, + 'is_crowd': {'shape': [None, 1], 'dtype': 'int32', 'lod_level': 1}, + 'is_difficult': {'shape': [None, 1], 'dtype': 'int32', 'lod_level': 1} + } + # yapf: disable + if 'gt_bbox' in fields: + targets_def = { + 'labels0': {'shape': [None, None, None, 1], 'dtype': 'int32', 'lod_level': 0}, + 'reg_target0': {'shape': [None, None, None, 4], 'dtype': 'float32', 'lod_level': 0}, + 'centerness0': {'shape': [None, None, None, 1], 'dtype': 'float32', 'lod_level': 0}, + 'labels1': {'shape': [None, None, None, 1], 'dtype': 'int32', 'lod_level': 0}, + 'reg_target1': {'shape': [None, None, None, 4], 'dtype': 'float32', 'lod_level': 0}, + 'centerness1': {'shape': [None, None, None, 1], 'dtype': 'float32', 'lod_level': 0}, + 'labels2': {'shape': [None, None, None, 1], 'dtype': 'int32', 'lod_level': 
0}, + 'reg_target2': {'shape': [None, None, None, 4], 'dtype': 'float32', 'lod_level': 0}, + 'centerness2': {'shape': [None, None, None, 1], 'dtype': 'float32', 'lod_level': 0}, + 'labels3': {'shape': [None, None, None, 1], 'dtype': 'int32', 'lod_level': 0}, + 'reg_target3': {'shape': [None, None, None, 4], 'dtype': 'float32', 'lod_level': 0}, + 'centerness3': {'shape': [None, None, None, 1], 'dtype': 'float32', 'lod_level': 0}, + 'labels4': {'shape': [None, None, None, 1], 'dtype': 'int32', 'lod_level': 0}, + 'reg_target4': {'shape': [None, None, None, 4], 'dtype': 'float32', 'lod_level': 0}, + 'centerness4': {'shape': [None, None, None, 1], 'dtype': 'float32', 'lod_level': 0}, + } + # yapf: enable + + # downsample = 128 + for k, stride in enumerate(self.fcos_head.fpn_stride): + k_lbl = 'labels{}'.format(k) + k_box = 'reg_target{}'.format(k) + k_ctn = 'centerness{}'.format(k) + grid_y = image_shape[-2] // stride if image_shape[-2] else None + grid_x = image_shape[-1] // stride if image_shape[-1] else None + if grid_x is not None: + num_pts = grid_x * grid_y + num_dim2 = 1 + else: + num_pts = None + num_dim2 = None + targets_def[k_lbl]['shape'][1] = num_pts + targets_def[k_box]['shape'][1] = num_pts + targets_def[k_ctn]['shape'][1] = num_pts + targets_def[k_lbl]['shape'][2] = num_dim2 + targets_def[k_box]['shape'][2] = num_dim2 + targets_def[k_ctn]['shape'][2] = num_dim2 + inputs_def.update(targets_def) + return inputs_def + + def build_inputs( + self, + image_shape=[3, None, None], + fields=[ + 'image', 'im_shape', 'im_id', 'gt_bbox', 'gt_class', 'is_crowd' + ], # for-train + use_dataloader=True, + iterable=False): + inputs_def = self._inputs_def(image_shape, fields) + if "gt_bbox" in fields: + for i in range(len(self.fcos_head.fpn_stride)): + fields.extend( + ['labels%d' % i, 'reg_target%d' % i, 'centerness%d' % i]) + feed_vars = OrderedDict([(key, fluid.layers.data( + name=key, + shape=inputs_def[key]['shape'], + dtype=inputs_def[key]['dtype'], + lod_level=inputs_def[key]['lod_level'])) for key in fields]) + loader = fluid.io.DataLoader.from_generator( + feed_list=list(feed_vars.values()), + capacity=16, + use_double_buffer=True, + iterable=iterable) if use_dataloader else None + return feed_vars, loader + + def train(self, feed_vars): + return self.build(feed_vars, 'train') + + def eval(self, feed_vars): + return self.build(feed_vars, 'test') + + def test(self, feed_vars): + return self.build(feed_vars, 'test') diff --git a/ppdet/modeling/backbones/fpn.py b/ppdet/modeling/backbones/fpn.py index 9bd491a66..2eefd3158 100644 --- a/ppdet/modeling/backbones/fpn.py +++ b/ppdet/modeling/backbones/fpn.py @@ -51,7 +51,8 @@ class FPN(object): spatial_scale=[1. / 32., 1. / 16., 1. / 8., 1. 
/ 4.], has_extra_convs=False, norm_type=None, - freeze_norm=False): + freeze_norm=False, + use_c5=True): self.freeze_norm = freeze_norm self.num_chan = num_chan self.min_level = min_level @@ -59,6 +60,7 @@ class FPN(object): self.spatial_scale = spatial_scale self.has_extra_convs = has_extra_convs self.norm_type = norm_type + self.use_c5 = use_c5 def _add_topdown_lateral(self, body_name, body_input, upper_output): lateral_name = 'fpn_inner_' + body_name + '_lateral' @@ -189,7 +191,10 @@ class FPN(object): # Coarser FPN levels introduced for RetinaNet highest_backbone_level = self.min_level + len(spatial_scale) - 1 if self.has_extra_convs and self.max_level > highest_backbone_level: - fpn_blob = body_dict[body_name_list[0]] + if self.use_c5: + fpn_blob = body_dict[body_name_list[0]] + else: + fpn_blob = fpn_dict[fpn_name_list[0]] for i in range(highest_backbone_level + 1, self.max_level + 1): fpn_blob_in = fpn_blob fpn_name = 'fpn_' + str(i) diff --git a/ppdet/modeling/losses/__init__.py b/ppdet/modeling/losses/__init__.py index eabac9bf8..1c806ad89 100644 --- a/ppdet/modeling/losses/__init__.py +++ b/ppdet/modeling/losses/__init__.py @@ -20,6 +20,7 @@ from . import giou_loss from . import diou_loss from . import iou_loss from . import balanced_l1_loss +from . import fcos_loss from .yolo_loss import * from .smooth_l1_loss import * @@ -27,3 +28,4 @@ from .giou_loss import * from .diou_loss import * from .iou_loss import * from .balanced_l1_loss import * +from .fcos_loss import * diff --git a/ppdet/modeling/losses/fcos_loss.py b/ppdet/modeling/losses/fcos_loss.py new file mode 100644 index 000000000..b1a5e597b --- /dev/null +++ b/ppdet/modeling/losses/fcos_loss.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
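
Because both the predicted and the target boxes in FCOSLoss are encoded as (l, t, r, b) distances from the same anchor point, IoU and GIoU reduce to elementwise min/max over the four distances, with box extents (l + r) by (t + b). A standalone NumPy sketch of the GIoU branch under that assumption (simplified, without the +1.0 smoothing used below; names are illustrative):

    import numpy as np

    def giou_loss_ltrb(pred, target, eps=1e-7):
        # Boxes share the anchor point, so the sides intersect independently.
        pl, pt, pr, pb = pred.T
        tl, tt, tr, tb = target.T
        area_p = (pl + pr) * (pt + pb)
        area_t = (tl + tr) * (tt + tb)
        iw = np.minimum(pl, tl) + np.minimum(pr, tr)   # intersection width
        ih = np.minimum(pt, tt) + np.minimum(pb, tb)   # intersection height
        cw = np.maximum(pl, tl) + np.maximum(pr, tr)   # enclosing-box width
        ch = np.maximum(pt, tt) + np.maximum(pb, tb)   # enclosing-box height
        inter = iw * ih
        union = area_p + area_t - inter
        iou = inter / (union + eps)
        giou = iou - (cw * ch - union) / (cw * ch + eps)
        return 1.0 - giou

    pred = np.array([[3., 2., 5., 6.]])
    target = np.array([[4., 4., 4., 4.]])
    print(giou_loss_ltrb(pred, target))   # ~[0.556]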
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import fluid
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+from ppdet.core.workspace import register, serializable
+
+INF = 1e8
+__all__ = ['FCOSLoss']
+
+
+@register
+@serializable
+class FCOSLoss(object):
+    """
+    FCOSLoss
+    Args:
+        loss_alpha (float): alpha in focal loss
+        loss_gamma (float): gamma in focal loss
+        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
+        reg_weights (float): weight for location loss
+    """
+
+    def __init__(self,
+                 loss_alpha=0.25,
+                 loss_gamma=2.0,
+                 iou_loss_type="IoU",
+                 reg_weights=1.0):
+        self.loss_alpha = loss_alpha
+        self.loss_gamma = loss_gamma
+        self.iou_loss_type = iou_loss_type
+        self.reg_weights = reg_weights
+
+    def __flatten_tensor(self, input, channel_first=False):
+        """
+        Flatten a Tensor
+        Args:
+            input (Variables): input Tensor
+            channel_first (bool): if true the dimension order of
+                the Tensor is [N, C, H, W], otherwise it is [N, H, W, C]
+        Return:
+            input_channel_last (Variables): the flattened Tensor in channel_last style
+        """
+        if channel_first:
+            input_channel_last = fluid.layers.transpose(
+                input, perm=[0, 2, 3, 1])
+        else:
+            input_channel_last = input
+        input_channel_last = fluid.layers.flatten(input_channel_last, axis=3)
+        return input_channel_last
+
+    def __iou_loss(self, pred, targets, positive_mask, weights=None):
+        """
+        Calculate the loss for location prediction
+        Args:
+            pred (Variables): bounding box predictions
+            targets (Variables): targets for positive samples
+            positive_mask (Variables): mask of positive samples
+            weights (Variables): weights for each positive sample
+        Return:
+            loss (Variables): location loss
+        """
+        plw = pred[:, 0] * positive_mask
+        pth = pred[:, 1] * positive_mask
+        prw = pred[:, 2] * positive_mask
+        pbh = pred[:, 3] * positive_mask
+        tlw = targets[:, 0] * positive_mask
+        tth = targets[:, 1] * positive_mask
+        trw = targets[:, 2] * positive_mask
+        tbh = targets[:, 3] * positive_mask
+        tlw.stop_gradient = True
+        trw.stop_gradient = True
+        tth.stop_gradient = True
+        tbh.stop_gradient = True
+        area_target = (tlw + trw) * (tth + tbh)
+        area_predict = (plw + prw) * (pth + pbh)
+        ilw = fluid.layers.elementwise_min(plw, tlw)
+        irw = fluid.layers.elementwise_min(prw, trw)
+        ith = fluid.layers.elementwise_min(pth, tth)
+        ibh = fluid.layers.elementwise_min(pbh, tbh)
+        clw = fluid.layers.elementwise_max(plw, tlw)
+        crw = fluid.layers.elementwise_max(prw, trw)
+        cth = fluid.layers.elementwise_max(pth, tth)
+        cbh = fluid.layers.elementwise_max(pbh, tbh)
+        area_inter = (ilw + irw) * (ith + ibh)
+        ious = (area_inter + 1.0) / (
+            area_predict + area_target - area_inter + 1.0)
+        ious = ious * positive_mask
+        if self.iou_loss_type.lower() == "linear_iou":
+            loss = 1.0 - ious
+        elif self.iou_loss_type.lower() == "giou":
+            area_uniou = area_predict + area_target - area_inter
+            area_circum = (clw + crw) * (cth + cbh) + 1e-7
+            giou = ious - (area_circum - area_uniou) / area_circum
+            loss = 1.0 - giou
+        elif self.iou_loss_type.lower() == "iou":
+            loss = 0.0 - fluid.layers.log(ious)
+        else:
+            raise KeyError
+        if weights is not None:
+            loss = loss * weights
+        return loss
+
+    def __call__(self, cls_logits, bboxes_reg, centerness, tag_labels,
+                 tag_bboxes, tag_center):
+        """
+        Calculate the losses for classification, location and centerness
+        Args:
+            cls_logits (list): list of Variables, predicted
+                scores for all anchor points with shape [N, M, C]
+            bboxes_reg (list): list of Variables, predicted
+                offsets for all anchor points with shape [N, M, 4]
+            centerness (list): list of Variables, predicted
+                centerness for all anchor points with shape [N, M, 1]
+            tag_labels (list): list of Variables, category
+                targets for each anchor point
+            tag_bboxes (list): list of Variables, bounding
+                box targets for positive samples
+            tag_center (list): list of Variables, centerness
+                targets for positive samples
+        Return:
+            loss (dict): dict of classification loss, bounding box regression loss and centerness regression loss
+        """
+        cls_logits_flatten_list = []
+        bboxes_reg_flatten_list = []
+        centerness_flatten_list = []
+        tag_labels_flatten_list = []
+        tag_bboxes_flatten_list = []
+        tag_center_flatten_list = []
+        num_lvl = len(cls_logits)
+        for lvl in range(num_lvl):
+            cls_logits_flatten_list.append(
+                self.__flatten_tensor(cls_logits[num_lvl - 1 - lvl], True))
+            bboxes_reg_flatten_list.append(
+                self.__flatten_tensor(bboxes_reg[num_lvl - 1 - lvl], True))
+            centerness_flatten_list.append(
+                self.__flatten_tensor(centerness[num_lvl - 1 - lvl], True))
+            tag_labels_flatten_list.append(
+                self.__flatten_tensor(tag_labels[lvl], False))
+            tag_bboxes_flatten_list.append(
+                self.__flatten_tensor(tag_bboxes[lvl], False))
+            tag_center_flatten_list.append(
+                self.__flatten_tensor(tag_center[lvl], False))
+
+        cls_logits_flatten = fluid.layers.concat(
+            cls_logits_flatten_list, axis=0)
+        bboxes_reg_flatten = fluid.layers.concat(
+            bboxes_reg_flatten_list, axis=0)
+        centerness_flatten = fluid.layers.concat(
+            centerness_flatten_list, axis=0)
+        tag_labels_flatten = fluid.layers.concat(
+            tag_labels_flatten_list, axis=0)
+        tag_bboxes_flatten = fluid.layers.concat(
+            tag_bboxes_flatten_list, axis=0)
+        tag_center_flatten = fluid.layers.concat(
+            tag_center_flatten_list, axis=0)
+        tag_labels_flatten.stop_gradient = True
+        tag_bboxes_flatten.stop_gradient = True
+        tag_center_flatten.stop_gradient = True
+
+        mask_positive = tag_labels_flatten > 0
+        mask_positive.stop_gradient = True
+        mask_positive_float = fluid.layers.cast(mask_positive, dtype="float32")
+        mask_positive_float.stop_gradient = True
+        num_positive_fp32 = fluid.layers.reduce_sum(mask_positive_float)
+        num_positive_int32 = fluid.layers.cast(num_positive_fp32, dtype="int32")
+        num_positive_int32 = num_positive_int32 * 0 + 1
+        num_positive_fp32.stop_gradient = True
+        num_positive_int32.stop_gradient = True
+        normalize_sum = fluid.layers.sum(tag_center_flatten)
+        normalize_sum.stop_gradient = True
+        normalize_sum = fluid.layers.reduce_sum(mask_positive_float *
+                                                normalize_sum)
+        normalize_sum.stop_gradient = True
+        cls_loss = fluid.layers.sigmoid_focal_loss(
+            cls_logits_flatten, tag_labels_flatten,
+            num_positive_int32) / num_positive_fp32
+        reg_loss = self.__iou_loss(
+            bboxes_reg_flatten, tag_bboxes_flatten, mask_positive_float,
+            tag_center_flatten) * mask_positive_float / normalize_sum
+        ctn_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=centerness_flatten,
+            label=tag_center_flatten) * mask_positive_float / num_positive_fp32
+        loss_all = {
+            "loss_centerness": fluid.layers.reduce_sum(ctn_loss),
+            "loss_cls": fluid.layers.reduce_sum(cls_loss),
+            "loss_box": fluid.layers.reduce_sum(reg_loss)
+        }
+        return loss_all
diff --git a/ppdet/modeling/losses/iou_loss.py b/ppdet/modeling/losses/iou_loss.py
index 39cb1b9b1..498ae607a 100644
--- a/ppdet/modeling/losses/iou_loss.py
+++ b/ppdet/modeling/losses/iou_loss.py
@@ -77,11 +77,6 @@ class IouLoss(object):
xkis2 = fluid.layers.elementwise_min(x2, x2g) ykis2 = fluid.layers.elementwise_min(y2, y2g) - xc1 = fluid.layers.elementwise_min(x1, x1g) - yc1 = fluid.layers.elementwise_min(y1, y1g) - xc2 = fluid.layers.elementwise_max(x2, x2g) - yc2 = fluid.layers.elementwise_max(y2, y2g) - intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) intsctk = intsctk * fluid.layers.greater_than( xkis2, xkis1) * fluid.layers.greater_than(ykis2, ykis1) diff --git a/ppdet/modeling/ops.py b/ppdet/modeling/ops.py index bcbe50918..28abd477c 100644 --- a/ppdet/modeling/ops.py +++ b/ppdet/modeling/ops.py @@ -46,8 +46,16 @@ def ConvNorm(input, act=None, norm_name=None, initializer=None, + bias_attr=False, name=None): fan = num_filters + if bias_attr: + bias_para = ParamAttr( + name=name + "_bias", + initializer=fluid.initializer.Constant(value=0), + learning_rate=lr_scale * 2) + else: + bias_para = False conv = fluid.layers.conv2d( input=input, num_filters=num_filters, @@ -61,7 +69,7 @@ def ConvNorm(input, name=name + "_weights", initializer=initializer, learning_rate=lr_scale), - bias_attr=False, + bias_attr=bias_para, name=name + '.conv2d.output.1') norm_lr = 0. if freeze_norm else 1. -- GitLab
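
With the patch applied, the new configs plug into the standard PaddleDetection entry points; a typical workflow (assuming the usual tools/ scripts in this repository):

    # train and evaluate the single-scale 1x model
    python tools/train.py -c configs/fcos_r50_fpn_1x.yml
    python tools/eval.py -c configs/fcos_r50_fpn_1x.yml \
        -o weights=output/fcos_r50_fpn_1x/model_final

The multiscale 2x config differs chiefly in the doubled schedule (max_iters, milestones, snapshot_iter) and in ResizeImage's list-valued target_size, which samples a training scale between 640 and 800 per image.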