From 2e418109636d175b0c902f7ef9900d0e2c1d29c1 Mon Sep 17 00:00:00 2001 From: littletomatodonkey <2120160898@bit.edu.cn> Date: Mon, 20 Apr 2020 15:32:34 +0800 Subject: [PATCH] Add mobile-side rcnn models (#488) * add moile-side rcnn server model --- configs/mobile_side/README.md | 17 ++ .../cascade_rcnn_mobilenetv3_fpn_320.yml | 219 +++++++++++++ .../cascade_rcnn_mobilenetv3_fpn_640.yml | 219 +++++++++++++ ppdet/modeling/backbones/mobilenet_v3.py | 289 ++++++++++++++++-- 4 files changed, 711 insertions(+), 33 deletions(-) create mode 100755 configs/mobile_side/README.md create mode 100644 configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_320.yml create mode 100644 configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_640.yml diff --git a/configs/mobile_side/README.md b/configs/mobile_side/README.md new file mode 100755 index 000000000..88ffe10a8 --- /dev/null +++ b/configs/mobile_side/README.md @@ -0,0 +1,17 @@ +# Practical Mobile-side detection method based on RCNN + +## Introduction + +* This is developed by PaddleDetection. Many useful tricks are utilized for the model training process. More details can be seen in the configuration file. +* The inference is tested on Qualcomm Snapdragon 845 Mobile Platform. + + +## Model Zoo + +| Backbone | Type | Image/gpu | Lr schd | Inf time on SD845 (fps) | Box AP | Mask AP | Download | +| :---------------------- | :-------------: | :-------: | :-----: | :------------: | :----: | :-----: | :----------------------------------------------------------: | +| MobileNetV3-vd-FPN | Cascade Faster | 2 | 5.6x(CosineDecay) | 8.13 | 25.0 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_mobilenetv3_fpn_320.tar) | +| MobileNetV3-vd-FPN | Cascade Faster | 2 | 5.6x(CosineDecay) | 2.66 | 30.2 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/cascade_rcnn_mobilenetv3_fpn_640.tar) | + +**note** +* `5.6x` means the model is trained with `500000` minibatches on 8 GPU cards (batch size=2 for each card). 
diff --git a/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_320.yml b/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_320.yml new file mode 100644 index 000000000..ce0c0b9f0 --- /dev/null +++ b/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_320.yml @@ -0,0 +1,219 @@ +architecture: CascadeRCNN +max_iters: 500000 +snapshot_iter: 50000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x1_0_ssld_pretrained.tar +weights: output/cascade_rcnn_mobilenetv3_fpn_320/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: MobileNetV3RCNN + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +MobileNetV3RCNN: + norm_type: bn + freeze_norm: true + norm_decay: 0.0 + feature_maps: [2, 3, 4] + conv_decay: 0.00001 + lr_mult_list: [0.25, 0.25, 0.5, 0.5, 0.75] + scale: 1.0 + model_name: large + +FPN: + min_level: 2 + max_level: 6 + num_chan: 48 + has_extra_convs: true + spatial_scale: [0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 16 + min_level: 2 + max_level: 6 + num_chan: 48 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 300 + post_nms_top_n: 100 + +FPNRoIAlign: + canconical_level: 3 + canonical_size: 112 + min_level: 2 + max_level: 4 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + 
+CascadeBBoxHead: + head: CascadeTwoFCHead + bbox_loss: BalancedL1Loss + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +BalancedL1Loss: + alpha: 0.5 + gamma: 1.5 + beta: 1.0 + loss_weight: 1.0 + +CascadeTwoFCHead: + mlp_dim: 128 + +LearningRate: + base_lr: 0.02 + schedulers: + - !CosineDecay + max_iters: 500000 + - !LinearWarmup + start_factor: 0.1 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.00004 + type: L2 + +TrainReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'gt_bbox', 'gt_class', 'is_crowd'] + dataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + sample_transforms: + - !DecodeImage + to_rgb: true + - !RandomFlipImage + prob: 0.5 + - !AutoAugmentImage + autoaug_type: v1 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: [224, 256, 288, 320, 352, 384] + max_size: 512 + interp: 1 + use_cv2: true + - !Permute + to_bgr: false + channel_first: true + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: false + batch_size: 2 + shuffle: true + worker_num: 2 + use_process: false + + +TestReader: + inputs_def: + # set image_shape if needed + fields: ['image', 'im_info', 'im_id', 'im_shape'] + dataset: + !ImageFolder + anno_path: annotations/instances_val2017.json + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 320 + target_size: 320 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: true + batch_size: 1 + shuffle: false + + + +EvalReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'im_shape'] + # 
for voc + #fields: ['image', 'im_info', 'im_id', 'gt_bbox', 'gt_class', 'is_difficult'] + dataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 320 + target_size: 320 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: true + batch_size: 1 + shuffle: false + drop_empty: false + worker_num: 2 diff --git a/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_640.yml b/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_640.yml new file mode 100644 index 000000000..54e7a7f3e --- /dev/null +++ b/configs/mobile_side/cascade_rcnn_mobilenetv3_fpn_640.yml @@ -0,0 +1,219 @@ +architecture: CascadeRCNN +max_iters: 500000 +snapshot_iter: 50000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/MobileNetV3_large_x1_0_ssld_pretrained.tar +weights: output/cascade_rcnn_mobilenetv3_fpn_640/model_final +metric: COCO +num_classes: 81 + +CascadeRCNN: + backbone: MobileNetV3RCNN + fpn: FPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: CascadeBBoxHead + bbox_assigner: CascadeBBoxAssigner + +MobileNetV3RCNN: + norm_type: bn + freeze_norm: true + norm_decay: 0.0 + feature_maps: [2, 3, 4] + conv_decay: 0.00001 + lr_mult_list: [1.0, 1.0, 1.0, 1.0, 1.0] + scale: 1.0 + model_name: large + +FPN: + min_level: 2 + max_level: 6 + num_chan: 48 + has_extra_convs: true + spatial_scale: [0.0625, 0.125, 0.25] + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 24 + min_level: 2 + max_level: 6 + 
num_chan: 48 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_positive_overlap: 0.7 + rpn_negative_overlap: 0.3 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 2000 + post_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + pre_nms_top_n: 300 + post_nms_top_n: 100 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + min_level: 2 + max_level: 5 + box_resolution: 7 + sampling_ratio: 2 + +CascadeBBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [10, 20, 30] + bg_thresh_lo: [0.0, 0.0, 0.0] + bg_thresh_hi: [0.5, 0.6, 0.7] + fg_thresh: [0.5, 0.6, 0.7] + fg_fraction: 0.25 + +CascadeBBoxHead: + head: CascadeTwoFCHead + bbox_loss: BalancedL1Loss + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +BalancedL1Loss: + alpha: 0.5 + gamma: 1.5 + beta: 1.0 + loss_weight: 1.0 + +CascadeTwoFCHead: + mlp_dim: 128 + +LearningRate: + base_lr: 0.01 + schedulers: + - !CosineDecay + max_iters: 500000 + - !LinearWarmup + start_factor: 0.1 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.00004 + type: L2 + +TrainReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'gt_bbox', 'gt_class', 'is_crowd'] + dataset: + !COCODataSet + image_dir: train2017 + anno_path: annotations/instances_train2017.json + dataset_dir: dataset/coco + sample_transforms: + - !DecodeImage + to_rgb: true + - !RandomFlipImage + prob: 0.5 + - !AutoAugmentImage + autoaug_type: v1 + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + target_size: [416, 448, 480, 512, 544, 576, 608, 640, 672] + max_size: 1000 + interp: 1 + use_cv2: true + - !Permute + to_bgr: false + channel_first: true + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: false + batch_size: 2 + shuffle: true + worker_num: 2 + use_process: false + + 
+TestReader: + inputs_def: + # set image_shape if needed + fields: ['image', 'im_info', 'im_id', 'im_shape'] + dataset: + !ImageFolder + anno_path: annotations/instances_val2017.json + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 640 + target_size: 640 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: true + batch_size: 1 + shuffle: false + + + +EvalReader: + inputs_def: + fields: ['image', 'im_info', 'im_id', 'im_shape'] + # for voc + #fields: ['image', 'im_info', 'im_id', 'gt_bbox', 'gt_class', 'is_difficult'] + dataset: + !COCODataSet + image_dir: val2017 + anno_path: annotations/instances_val2017.json + dataset_dir: dataset/coco + sample_transforms: + - !DecodeImage + to_rgb: true + with_mixup: false + - !NormalizeImage + is_channel_first: false + is_scale: true + mean: [0.485,0.456,0.406] + std: [0.229, 0.224,0.225] + - !ResizeImage + interp: 1 + max_size: 640 + target_size: 640 + use_cv2: true + - !Permute + channel_first: true + to_bgr: false + batch_transforms: + - !PadBatch + pad_to_stride: 32 + use_padded_im_info: true + batch_size: 1 + shuffle: false + drop_empty: false + worker_num: 2 diff --git a/ppdet/modeling/backbones/mobilenet_v3.py b/ppdet/modeling/backbones/mobilenet_v3.py index dfaed2edb..d48731af3 100644 --- a/ppdet/modeling/backbones/mobilenet_v3.py +++ b/ppdet/modeling/backbones/mobilenet_v3.py @@ -22,14 +22,18 @@ import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.regularizer import L2Decay +import math +import numpy as np +from collections import OrderedDict + from ppdet.core.workspace import register from numbers import Integral -__all__ = ['MobileNetV3'] +__all__ = ['MobileNetV3', 'MobileNetV3RCNN'] @register -class 
MobileNetV3(): +class MobileNetV3(object): """ MobileNet v3, see https://arxiv.org/abs/1905.02244 Args: @@ -40,18 +44,25 @@ class MobileNetV3(): conv_decay (float): weight decay for convolution layer weights. feature_maps (list): index of stages whose feature maps are returned. extra_block_filters (list): number of filter for each extra block. + lr_mult_list (list): learning rate ratio of different blocks, lower learning rate ratio + is need for pretrained model got using distillation(default as + [1.0, 1.0, 1.0, 1.0, 1.0]). + freeze_norm (bool): freeze normalization layers + feature_maps (list): feature maps used in two-stage rcnn models(default as None). """ __shared__ = ['norm_type'] - def __init__(self, - scale=1.0, - model_name='small', - feature_maps=[5, 6, 7, 8, 9, 10], - conv_decay=0.0, - norm_type='bn', - norm_decay=0.0, - extra_block_filters=[[256, 512], [128, 256], [128, 256], - [64, 128]]): + def __init__( + self, + scale=1.0, + model_name='small', + feature_maps=[5, 6, 7, 8, 9, 10], + conv_decay=0.0, + norm_type='bn', + norm_decay=0.0, + extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + freeze_norm=False, ): if isinstance(feature_maps, Integral): feature_maps = [feature_maps] @@ -64,6 +75,12 @@ class MobileNetV3(): self.inplanes = 16 self.end_points = [] self.block_stride = 0 + + self.lr_mult_list = lr_mult_list + self.freeze_norm = freeze_norm + self.norm_type = norm_type + self.curr_stage = 0 + if model_name == "large": self.cfg = [ # kernel_size, expand, channel, se_block, act_mode, stride @@ -83,6 +100,8 @@ class MobileNetV3(): [5, 960, 160, True, 'hard_swish', 1], [5, 960, 160, True, 'hard_swish', 1], ] + self.cls_ch_squeeze = 960 + self.cls_ch_expand = 1280 elif model_name == "small": self.cfg = [ # kernel_size, expand, channel, se_block, act_mode, stride @@ -98,6 +117,8 @@ class MobileNetV3(): [5, 576, 96, True, 'hard_swish', 1], [5, 576, 96, True, 'hard_swish', 1], ] + 
self.cls_ch_squeeze = 576 + self.cls_ch_expand = 1280 else: raise NotImplementedError @@ -112,8 +133,9 @@ class MobileNetV3(): act=None, name=None, use_cudnn=True): - conv_param_attr = ParamAttr( - name=name + '_weights', regularizer=L2Decay(self.conv_decay)) + lr_idx = self.curr_stage // 3 + lr_idx = min(lr_idx, len(self.lr_mult_list) - 1) + lr_mult = self.lr_mult_list[lr_idx] conv = fluid.layers.conv2d( input=input, num_filters=num_filters, @@ -123,19 +145,14 @@ class MobileNetV3(): groups=num_groups, act=None, use_cudnn=use_cudnn, - param_attr=conv_param_attr, + param_attr=ParamAttr( + name=name + '_weights', + learning_rate=lr_mult, + regularizer=L2Decay(self.conv_decay)), bias_attr=False) bn_name = name + '_bn' - bn_param_attr = ParamAttr( - name=bn_name + "_scale", regularizer=L2Decay(self.norm_decay)) - bn_bias_attr = ParamAttr( - name=bn_name + "_offset", regularizer=L2Decay(self.norm_decay)) - bn = fluid.layers.batch_norm( - input=conv, - param_attr=bn_param_attr, - bias_attr=bn_bias_attr, - moving_mean_name=bn_name + '_mean', - moving_variance_name=bn_name + '_variance') + bn = self._bn(conv, bn_name=bn_name) + if if_act: if act == 'relu': bn = fluid.layers.relu(bn) @@ -145,10 +162,64 @@ class MobileNetV3(): bn = fluid.layers.relu6(bn) return bn + def _bn(self, input, act=None, bn_name=None): + lr_idx = self.curr_stage // 3 + lr_idx = min(lr_idx, len(self.lr_mult_list) - 1) + lr_mult = self.lr_mult_list[lr_idx] + norm_lr = 0. 
if self.freeze_norm else lr_mult + norm_decay = self.norm_decay + pattr = ParamAttr( + name=bn_name + '_scale', + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + battr = ParamAttr( + name=bn_name + '_offset', + learning_rate=norm_lr, + regularizer=L2Decay(norm_decay)) + + conv = input + + if self.norm_type in ['bn', 'sync_bn']: + global_stats = True if self.freeze_norm else False + out = fluid.layers.batch_norm( + input=conv, + act=act, + name=bn_name + '.output.1', + param_attr=pattr, + bias_attr=battr, + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + use_global_stats=global_stats) + scale = fluid.framework._get_var(pattr.name) + bias = fluid.framework._get_var(battr.name) + elif self.norm_type == 'affine_channel': + scale = fluid.layers.create_parameter( + shape=[conv.shape[1]], + dtype=conv.dtype, + attr=pattr, + default_initializer=fluid.initializer.Constant(1.)) + bias = fluid.layers.create_parameter( + shape=[conv.shape[1]], + dtype=conv.dtype, + attr=battr, + default_initializer=fluid.initializer.Constant(0.)) + out = fluid.layers.affine_channel( + x=conv, scale=scale, bias=bias, act=act) + + if self.freeze_norm: + scale.stop_gradient = True + bias.stop_gradient = True + + return out + def _hard_swish(self, x): return x * fluid.layers.relu6(x + 3) / 6. 
def _se_block(self, input, num_out_filter, ratio=4, name=None): + lr_idx = self.curr_stage // 3 + lr_idx = min(lr_idx, len(self.lr_mult_list) - 1) + lr_mult = self.lr_mult_list[lr_idx] + num_mid_filter = int(num_out_filter // ratio) pool = fluid.layers.pool2d( input=input, pool_type='avg', global_pooling=True, use_cudnn=False) @@ -157,15 +228,27 @@ class MobileNetV3(): filter_size=1, num_filters=num_mid_filter, act='relu', - param_attr=ParamAttr(name=name + '_1_weights'), - bias_attr=ParamAttr(name=name + '_1_offset')) + param_attr=ParamAttr( + name=name + '_1_weights', + learning_rate=lr_mult, + regularizer=L2Decay(self.conv_decay)), + bias_attr=ParamAttr( + name=name + '_1_offset', + learning_rate=lr_mult, + regularizer=L2Decay(self.conv_decay))) conv2 = fluid.layers.conv2d( input=conv1, filter_size=1, num_filters=num_out_filter, act='hard_sigmoid', - param_attr=ParamAttr(name=name + '_2_weights'), - bias_attr=ParamAttr(name=name + '_2_offset')) + param_attr=ParamAttr( + name=name + '_2_weights', + learning_rate=lr_mult, + regularizer=L2Decay(self.conv_decay)), + bias_attr=ParamAttr( + name=name + '_2_offset', + learning_rate=lr_mult, + regularizer=L2Decay(self.conv_decay))) scale = fluid.layers.elementwise_mul(x=input, y=conv2, axis=0) return scale @@ -190,6 +273,7 @@ class MobileNetV3(): if_act=True, act=act, name=name + '_expand') + if self.block_stride == 4 and stride == 2: self.block_stride += 1 if self.block_stride in self.feature_maps: @@ -258,6 +342,14 @@ class MobileNetV3(): name=name + "_extra2_sep") return normal_conv + def _make_divisible(self, v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + def __call__(self, input): scale = self.scale inplanes = self.inplanes @@ -268,7 +360,7 @@ class MobileNetV3(): conv = self._conv_bn_layer( input, filter_size=3, - num_filters=inplanes if scale <= 1.0 else 
int(inplanes * scale), + num_filters=self._make_divisible(inplanes * scale), stride=2, padding=1, num_groups=1, @@ -276,6 +368,7 @@ class MobileNetV3(): act='hard_swish', name='conv1') i = 0 + inplanes = self._make_divisible(inplanes * scale) for layer_cfg in cfg: if layer_cfg[5] == 2: self.block_stride += 1 @@ -285,15 +378,16 @@ class MobileNetV3(): conv = self._residual_unit( input=conv, num_in_filter=inplanes, - num_mid_filter=int(scale * layer_cfg[1]), - num_out_filter=int(scale * layer_cfg[2]), + num_mid_filter=self._make_divisible(scale * layer_cfg[1]), + num_out_filter=self._make_divisible(scale * layer_cfg[2]), act=layer_cfg[4], stride=layer_cfg[5], filter_size=layer_cfg[0], use_se=layer_cfg[3], name='conv' + str(i + 2)) - inplanes = int(scale * layer_cfg[2]) + inplanes = self._make_divisible(scale * layer_cfg[2]) i += 1 + self.curr_stage += 1 self.block_stride += 1 if self.block_stride in self.feature_maps: self.end_points.append(conv) @@ -302,7 +396,7 @@ class MobileNetV3(): conv_extra = self._conv_bn_layer( conv, filter_size=1, - num_filters=int(scale * cfg[-1][1]), + num_filters=self._make_divisible(scale * cfg[-1][1]), stride=1, padding="SAME", num_groups=1, @@ -324,3 +418,132 @@ class MobileNetV3(): return OrderedDict([('mbv3_{}'.format(idx), feat) for idx, feat in enumerate(self.end_points)]) + + +@register +class MobileNetV3RCNN(MobileNetV3): + def __init__( + self, + scale=1.0, + model_name='large', + conv_decay=0.0, + norm_type='bn', + norm_decay=0.0, + freeze_norm=True, + feature_maps=[2, 3, 4, 5], + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], ): + super(MobileNetV3RCNN, self).__init__( + scale=scale, + model_name=model_name, + conv_decay=conv_decay, + norm_type=norm_type, + norm_decay=norm_decay, + lr_mult_list=lr_mult_list, + feature_maps=feature_maps) + self.curr_stage = 0 + self.block_stride = 1 + + def _residual_unit(self, + input, + num_in_filter, + num_mid_filter, + num_out_filter, + stride, + filter_size, + act=None, + use_se=False, + 
name=None): + input_data = input + conv0 = self._conv_bn_layer( + input=input, + filter_size=1, + num_filters=num_mid_filter, + stride=1, + padding=0, + if_act=True, + act=act, + name=name + '_expand') + + feature_level = int(np.log2(self.block_stride)) + if feature_level in self.feature_maps and stride == 2: + self.end_points.append(conv0) + + conv1 = self._conv_bn_layer( + input=conv0, + filter_size=filter_size, + num_filters=num_mid_filter, + stride=stride, + padding=int((filter_size - 1) // 2), + if_act=True, + act=act, + num_groups=num_mid_filter, + use_cudnn=False, + name=name + '_depthwise') + + if use_se: + conv1 = self._se_block( + input=conv1, num_out_filter=num_mid_filter, name=name + '_se') + + conv2 = self._conv_bn_layer( + input=conv1, + filter_size=1, + num_filters=num_out_filter, + stride=1, + padding=0, + if_act=False, + name=name + '_linear') + if num_in_filter != num_out_filter or stride != 1: + return conv2 + else: + return fluid.layers.elementwise_add(x=input_data, y=conv2, act=None) + + def __call__(self, input): + scale = self.scale + inplanes = self.inplanes + cfg = self.cfg + #conv1 + conv = self._conv_bn_layer( + input, + filter_size=3, + num_filters=self._make_divisible(inplanes * scale), + stride=2, + padding=1, + num_groups=1, + if_act=True, + act='hard_swish', + name='conv1') + i = 0 + inplanes = self._make_divisible(inplanes * scale) + for layer_cfg in cfg: + self.block_stride *= layer_cfg[5] + conv = self._residual_unit( + input=conv, + num_in_filter=inplanes, + num_mid_filter=self._make_divisible(scale * layer_cfg[1]), + num_out_filter=self._make_divisible(scale * layer_cfg[2]), + act=layer_cfg[4], + stride=layer_cfg[5], + filter_size=layer_cfg[0], + use_se=layer_cfg[3], + name='conv' + str(i + 2)) + inplanes = self._make_divisible(scale * layer_cfg[2]) + i += 1 + self.curr_stage += 1 + + if np.max(self.feature_maps) >= 5: + conv = self._conv_bn_layer( + input=conv, + filter_size=1, + num_filters=self._make_divisible(scale * 
cfg[-1][1]), + stride=1, + padding=0, + num_groups=1, + if_act=True, + act='hard_swish', + name='conv_last') + self.end_points.append(conv) + i += 1 + + res = OrderedDict([('mv3_{}'.format(idx), self.end_points[idx]) + for idx, feat_idx in enumerate(self.feature_maps)]) + return res -- GitLab