# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import TruncatedNormal, Constant from paddle.fluid.regularizer import L2Decay from ppdet.modeling.ops import RetinaOutputDecoder from ppdet.core.workspace import register __all__ = ['EfficientHead'] @register class EfficientHead(object): """ EfficientDet Head Args: output_decoder (object): `RetinaOutputDecoder` instance. repeat (int): Number of convolution layers. num_chan (int): Number of octave output channels. prior_prob (float): Initial value of the class prediction layer bias. num_anchors (int): Number of anchors per cell. num_classes (int): Number of classes. gamma (float): Gamma parameter for focal loss. alpha (float): Alpha parameter for focal loss. sigma (float): Sigma parameter for smooth l1 loss. """ __inject__ = ['output_decoder'] __shared__ = ['num_classes'] def __init__(self, output_decoder=RetinaOutputDecoder().__dict__, repeat=3, num_chan=64, prior_prob=0.01, num_anchors=9, num_classes=81, gamma=1.5, alpha=0.25, delta=0.1): super(EfficientHead, self).__init__() self.output_decoder = output_decoder self.repeat = repeat self.num_chan = num_chan self.prior_prob = prior_prob self.num_anchors = num_anchors self.num_classes = num_classes self.gamma = gamma self.alpha = alpha self.delta = delta if isinstance(output_decoder, dict): self.output_decoder = RetinaOutputDecoder(**output_decoder) def _get_output(self, body_feats): def separable_conv(inputs, num_chan, bias_init=None, name=''): dw_conv_name = name + '_dw' pw_conv_name = name + '_pw' in_chan = inputs.shape[1] fan_in = np.sqrt(1. / (in_chan * 3 * 3)) feat = fluid.layers.conv2d( input=inputs, num_filters=in_chan, groups=in_chan, filter_size=3, stride=1, padding='SAME', param_attr=ParamAttr( name=dw_conv_name + '_w', initializer=TruncatedNormal(scale=fan_in)), bias_attr=False) fan_in = np.sqrt(1. / in_chan) feat = fluid.layers.conv2d( input=feat, num_filters=num_chan, filter_size=1, stride=1, param_attr=ParamAttr( name=pw_conv_name + '_w', initializer=TruncatedNormal(scale=fan_in)), bias_attr=ParamAttr( name=pw_conv_name + '_b', initializer=bias_init, regularizer=L2Decay(0.))) return feat def subnet(inputs, prefix, level): feat = inputs for i in range(self.repeat): # NOTE share weight across FPN levels conv_name = '{}_pred_conv_{}'.format(prefix, i) feat = separable_conv(feat, self.num_chan, name=conv_name) # NOTE batch norm params are not shared bn_name = '{}_pred_bn_{}_{}'.format(prefix, level, i) feat = fluid.layers.batch_norm( input=feat, act='swish', momentum=0.997, epsilon=1e-4, moving_mean_name=bn_name + '_mean', moving_variance_name=bn_name + '_variance', param_attr=ParamAttr( name=bn_name + '_w', initializer=Constant(value=1.), regularizer=L2Decay(0.)), bias_attr=ParamAttr( name=bn_name + '_b', regularizer=L2Decay(0.))) return feat cls_preds = [] box_preds = [] for l, feat in enumerate(body_feats): cls_out = subnet(feat, 'cls', l) box_out = subnet(feat, 'box', l) bias_init = float(-np.log((1 - self.prior_prob) / self.prior_prob)) bias_init = Constant(value=bias_init) cls_pred = separable_conv( cls_out, self.num_anchors * (self.num_classes - 1), bias_init=bias_init, name='cls_pred') cls_pred = fluid.layers.transpose(cls_pred, perm=[0, 2, 3, 1]) cls_pred = fluid.layers.reshape( cls_pred, shape=(0, -1, self.num_classes - 1)) cls_preds.append(cls_pred) box_pred = separable_conv( box_out, self.num_anchors * 4, name='box_pred') box_pred = fluid.layers.transpose(box_pred, perm=[0, 2, 3, 1]) box_pred = fluid.layers.reshape(box_pred, shape=(0, -1, 4)) box_preds.append(box_pred) return cls_preds, box_preds def get_prediction(self, body_feats, anchors, im_info): cls_preds, box_preds = self._get_output(body_feats) cls_preds = [fluid.layers.sigmoid(pred) for pred in cls_preds] pred_result = self.output_decoder( bboxes=box_preds, scores=cls_preds, anchors=anchors, im_info=im_info) return {'bbox': pred_result} def get_loss(self, body_feats, gt_labels, gt_targets, fg_num): cls_preds, box_preds = self._get_output(body_feats) fg_num = fluid.layers.reduce_sum(fg_num, name='fg_num') fg_num.stop_gradient = True cls_pred = fluid.layers.concat(cls_preds, axis=1) box_pred = fluid.layers.concat(box_preds, axis=1) cls_pred_reshape = fluid.layers.reshape( cls_pred, shape=(-1, self.num_classes - 1)) gt_labels_reshape = fluid.layers.reshape(gt_labels, shape=(-1, 1)) loss_cls = fluid.layers.sigmoid_focal_loss( x=cls_pred_reshape, label=gt_labels_reshape, fg_num=fg_num, gamma=self.gamma, alpha=self.alpha) loss_cls = fluid.layers.reduce_sum(loss_cls) loss_bbox = fluid.layers.huber_loss( input=box_pred, label=gt_targets, delta=self.delta) mask = fluid.layers.expand(gt_labels, expand_times=[1, 1, 4]) > 0 loss_bbox *= fluid.layers.cast(mask, 'float32') loss_bbox = fluid.layers.reduce_sum(loss_bbox) / (fg_num * 4) return {'loss_cls': loss_cls, 'loss_bbox': loss_bbox}