diff --git a/configs/acfpn/README.md b/configs/acfpn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..81a3250acc645539978f1fbefbe9d85effb58f85 --- /dev/null +++ b/configs/acfpn/README.md @@ -0,0 +1,16 @@ +# Attention-guided Context Feature Pyramid Network for Object Detection + +## Introduction + +- Attention-guided Context Feature Pyramid Network for Object Detection: [https://arxiv.org/abs/2005.11475](https://arxiv.org/abs/2005.11475) + +``` +Cao J, Chen Q, Guo J, et al. Attention-guided Context Feature Pyramid Network for Object Detection[J]. arXiv preprint arXiv:2005.11475, 2020. +``` + + +## Model Zoo + +| Backbone | Type | Image/gpu | Lr schd | Inf time (fps) | Box AP | Mask AP | Download | Configs | +| :---------------------- | :-------------: | :-------: | :-----: | :------------: | :----: | :-----: | :----------------------------------------------------------: | :-----: | +| ResNet50-vd-ACFPN | Faster | 2 | 1x | 23.432 | 39.6 | - | [model](https://paddlemodels.bj.bcebos.com/object_detection/faster_rcnn_r50_vd_acfpn_1x.pdparams) | [config](https://github.com/PaddlePaddle/PaddleDetection/tree/master/configs/acfpn/faster_rcnn_r50_vd_acfpn_1x.yml) | diff --git a/configs/acfpn/faster_rcnn_r50_vd_acfpn_1x.yml b/configs/acfpn/faster_rcnn_r50_vd_acfpn_1x.yml new file mode 100644 index 0000000000000000000000000000000000000000..f58b983d16713686f6916fd9ac9c3113f3eb378e --- /dev/null +++ b/configs/acfpn/faster_rcnn_r50_vd_acfpn_1x.yml @@ -0,0 +1,107 @@ +architecture: FasterRCNN +max_iters: 90000 +snapshot_iter: 10000 +use_gpu: true +log_smooth_window: 20 +save_dir: output +pretrain_weights: https://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar +weights: output/faster_rcnn_r50_vd_acfpn_1x/model_final +metric: COCO +num_classes: 81 + +FasterRCNN: + backbone: ResNet + fpn: ACFPN + rpn_head: FPNRPNHead + roi_extractor: FPNRoIAlign + bbox_head: BBoxHead + bbox_assigner: BBoxAssigner + +ResNet: + depth: 50 + feature_maps: [2, 3, 4, 5] + freeze_at: 2 + norm_type: bn + variant: d + +ACFPN: + max_level: 6 + min_level: 2 + num_chan: 256 + spatial_scale: [0.03125, 0.0625, 0.125, 0.25] + norm_groups: 32 + +FPNRPNHead: + anchor_generator: + anchor_sizes: [32, 64, 128, 256, 512] + aspect_ratios: [0.5, 1.0, 2.0] + stride: [16.0, 16.0] + variance: [1.0, 1.0, 1.0, 1.0] + anchor_start_size: 32 + max_level: 6 + min_level: 2 + num_chan: 256 + rpn_target_assign: + rpn_batch_size_per_im: 256 + rpn_fg_fraction: 0.5 + rpn_negative_overlap: 0.3 + rpn_positive_overlap: 0.7 + rpn_straddle_thresh: 0.0 + train_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 2000 + pre_nms_top_n: 2000 + test_proposal: + min_size: 0.0 + nms_thresh: 0.7 + post_nms_top_n: 1000 + pre_nms_top_n: 1000 + +FPNRoIAlign: + canconical_level: 4 + canonical_size: 224 + max_level: 5 + min_level: 2 + box_resolution: 7 + sampling_ratio: 2 + +BBoxAssigner: + batch_size_per_im: 512 + bbox_reg_weights: [0.1, 0.1, 0.2, 0.2] + bg_thresh_hi: 0.5 + bg_thresh_lo: 0.0 + fg_fraction: 0.25 + fg_thresh: 0.5 + +BBoxHead: + head: TwoFCHead + nms: + keep_top_k: 100 + nms_threshold: 0.5 + score_threshold: 0.05 + +TwoFCHead: + mlp_dim: 1024 + +LearningRate: + base_lr: 0.02 + schedulers: + - !PiecewiseDecay + gamma: 0.1 + milestones: [60000, 80000] + - !LinearWarmup + start_factor: 0.1 + steps: 500 + +OptimizerBuilder: + optimizer: + momentum: 0.9 + type: Momentum + regularizer: + factor: 0.0001 + type: L2 + +_READER_: '../faster_fpn_reader.yml' +TrainReader: + batch_size: 2 diff --git a/ppdet/modeling/backbones/__init__.py b/ppdet/modeling/backbones/__init__.py index aa33cb11f8dddf9d15ceb7611c59922b30837c28..30d1b9f50ebbfafeedb39819d4724b3a5e1621d6 100644 --- a/ppdet/modeling/backbones/__init__.py +++ b/ppdet/modeling/backbones/__init__.py @@ -33,6 +33,7 @@ from . import hourglass from . import efficientnet from . import bifpn from . import cspdarknet +from . import acfpn from .resnet import * from .resnext import * @@ -53,3 +54,4 @@ from .hourglass import * from .efficientnet import * from .bifpn import * from .cspdarknet import * +from .acfpn import * diff --git a/ppdet/modeling/backbones/acfpn.py b/ppdet/modeling/backbones/acfpn.py new file mode 100644 index 0000000000000000000000000000000000000000..852586b2fdd2495e761fe7e40301ba65cd721d23 --- /dev/null +++ b/ppdet/modeling/backbones/acfpn.py @@ -0,0 +1,338 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import OrderedDict +import copy +from paddle import fluid +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import Xavier +from paddle.fluid.regularizer import L2Decay + +from ppdet.core.workspace import register +from ppdet.modeling.ops import ConvNorm + +__all__ = ['ACFPN'] + + +@register +class ACFPN(object): + """ + Attention-guided Context Feature Pyramid Network for Object Detection, + see https://arxiv.org/abs/2005.11475 + + Args: + num_chan (int): number of feature channels + min_level (int): lowest level of the backbone feature map to use + max_level (int): highest level of the backbone feature map to use + spatial_scale (list): feature map scaling factor + has_extra_convs (bool): whether has extral convolutions in higher levels + norm_type (str|None): normalization type, 'bn'/'sync_bn'/'affine_channel' + use_c5 (bool): whether to use C5 as the feature map. + norm_groups (int): group number of group norm. + """ + __shared__ = ['norm_type', 'freeze_norm'] + + def __init__(self, + num_chan=256, + min_level=2, + max_level=6, + spatial_scale=[1. / 32., 1. / 16., 1. / 8., 1. / 4.], + has_extra_convs=False, + norm_type=None, + freeze_norm=False, + use_c5=True, + norm_groups=32): + self.freeze_norm = freeze_norm + self.num_chan = num_chan + self.min_level = min_level + self.max_level = max_level + self.spatial_scale = spatial_scale + self.has_extra_convs = has_extra_convs + self.norm_type = norm_type + self.use_c5 = use_c5 + self.norm_groups = norm_groups + + def _add_topdown_lateral(self, body_name, body_input, upper_output): + lateral_name = 'fpn_inner_' + body_name + '_lateral' + topdown_name = 'fpn_topdown_' + body_name + fan = body_input.shape[1] + if self.norm_type: + initializer = Xavier(fan_out=fan) + lateral = ConvNorm( + body_input, + self.num_chan, + 1, + initializer=initializer, + norm_type=self.norm_type, + freeze_norm=self.freeze_norm, + name=lateral_name, + norm_name=lateral_name) + else: + lateral = fluid.layers.conv2d( + body_input, + self.num_chan, + 1, + param_attr=ParamAttr( + name=lateral_name + "_w", initializer=Xavier(fan_out=fan)), + bias_attr=ParamAttr( + name=lateral_name + "_b", + learning_rate=2., + regularizer=L2Decay(0.)), + name=lateral_name) + topdown = fluid.layers.resize_nearest( + upper_output, scale=2., name=topdown_name) + + return lateral + topdown + + def dense_aspp_block(self, input, num_filters1, num_filters2, dilation_rate, + dropout_prob, name): + + conv = ConvNorm( + input, + num_filters=num_filters1, + filter_size=1, + stride=1, + groups=1, + norm_decay=0., + norm_type='gn', + norm_groups=self.norm_groups, + dilation=dilation_rate, + lr_scale=1, + freeze_norm=False, + act="relu", + norm_name=name + "_gn", + initializer=None, + bias_attr=False, + name=name + "_gn") + + conv = fluid.layers.conv2d( + conv, + num_filters2, + filter_size=3, + padding=dilation_rate, + dilation=dilation_rate, + act="relu", + param_attr=ParamAttr(name=name + "_conv_w"), + bias_attr=ParamAttr(name=name + "_conv_b"), ) + + if dropout_prob > 0: + conv = fluid.layers.dropout(conv, dropout_prob=dropout_prob) + + return conv + + def dense_aspp(self, input, name=None): + dropout0 = 0.1 + d_feature0 = 512 + d_feature1 = 256 + + aspp3 = self.dense_aspp_block( + input, + num_filters1=d_feature0, + num_filters2=d_feature1, + dropout_prob=dropout0, + name=name + '_aspp3', + dilation_rate=3) + conv = fluid.layers.concat([aspp3, input], axis=1) + + aspp6 = self.dense_aspp_block( + conv, + num_filters1=d_feature0, + num_filters2=d_feature1, + dropout_prob=dropout0, + name=name + '_aspp6', + dilation_rate=6) + conv = fluid.layers.concat([aspp6, conv], axis=1) + + aspp12 = self.dense_aspp_block( + conv, + num_filters1=d_feature0, + num_filters2=d_feature1, + dropout_prob=dropout0, + name=name + '_aspp12', + dilation_rate=12) + conv = fluid.layers.concat([aspp12, conv], axis=1) + + aspp18 = self.dense_aspp_block( + conv, + num_filters1=d_feature0, + num_filters2=d_feature1, + dropout_prob=dropout0, + name=name + '_aspp18', + dilation_rate=18) + conv = fluid.layers.concat([aspp18, conv], axis=1) + + aspp24 = self.dense_aspp_block( + conv, + num_filters1=d_feature0, + num_filters2=d_feature1, + dropout_prob=dropout0, + name=name + '_aspp24', + dilation_rate=24) + + conv = fluid.layers.concat( + [aspp3, aspp6, aspp12, aspp18, aspp24], axis=1) + + conv = ConvNorm( + conv, + num_filters=self.num_chan, + filter_size=1, + stride=1, + groups=1, + norm_decay=0., + norm_type='gn', + norm_groups=self.norm_groups, + dilation=1, + lr_scale=1, + freeze_norm=False, + act="relu", + norm_name=name + "_dense_aspp_reduce_gn", + initializer=None, + bias_attr=False, + name=name + "_dense_aspp_reduce_gn") + + return conv + + def get_output(self, body_dict): + """ + Add FPN onto backbone. + + Args: + body_dict(OrderedDict): Dictionary of variables and each element is the + output of backbone. + + Return: + fpn_dict(OrderedDict): A dictionary represents the output of FPN with + their name. + spatial_scale(list): A list of multiplicative spatial scale factor. + """ + spatial_scale = copy.deepcopy(self.spatial_scale) + body_name_list = list(body_dict.keys())[::-1] + num_backbone_stages = len(body_name_list) + self.fpn_inner_output = [[] for _ in range(num_backbone_stages)] + fpn_inner_name = 'fpn_inner_' + body_name_list[0] + body_input = body_dict[body_name_list[0]] + fan = body_input.shape[1] + if self.norm_type: + initializer = Xavier(fan_out=fan) + self.fpn_inner_output[0] = ConvNorm( + body_input, + self.num_chan, + 1, + initializer=initializer, + norm_type=self.norm_type, + freeze_norm=self.freeze_norm, + name=fpn_inner_name, + norm_name=fpn_inner_name) + else: + self.fpn_inner_output[0] = fluid.layers.conv2d( + body_input, + self.num_chan, + 1, + param_attr=ParamAttr( + name=fpn_inner_name + "_w", + initializer=Xavier(fan_out=fan)), + bias_attr=ParamAttr( + name=fpn_inner_name + "_b", + learning_rate=2., + regularizer=L2Decay(0.)), + name=fpn_inner_name) + + self.fpn_inner_output[0] += self.dense_aspp( + self.fpn_inner_output[0], name="acfpn") + + for i in range(1, num_backbone_stages): + body_name = body_name_list[i] + body_input = body_dict[body_name] + top_output = self.fpn_inner_output[i - 1] + fpn_inner_single = self._add_topdown_lateral(body_name, body_input, + top_output) + self.fpn_inner_output[i] = fpn_inner_single + fpn_dict = {} + fpn_name_list = [] + for i in range(num_backbone_stages): + fpn_name = 'fpn_' + body_name_list[i] + fan = self.fpn_inner_output[i].shape[1] * 3 * 3 + if self.norm_type: + initializer = Xavier(fan_out=fan) + fpn_output = ConvNorm( + self.fpn_inner_output[i], + self.num_chan, + 3, + initializer=initializer, + norm_type=self.norm_type, + freeze_norm=self.freeze_norm, + name=fpn_name, + norm_name=fpn_name) + else: + fpn_output = fluid.layers.conv2d( + self.fpn_inner_output[i], + self.num_chan, + filter_size=3, + padding=1, + param_attr=ParamAttr( + name=fpn_name + "_w", initializer=Xavier(fan_out=fan)), + bias_attr=ParamAttr( + name=fpn_name + "_b", + learning_rate=2., + regularizer=L2Decay(0.)), + name=fpn_name) + fpn_dict[fpn_name] = fpn_output + fpn_name_list.append(fpn_name) + if not self.has_extra_convs and self.max_level - self.min_level == len( + spatial_scale): + body_top_name = fpn_name_list[0] + body_top_extension = fluid.layers.pool2d( + fpn_dict[body_top_name], + 1, + 'max', + pool_stride=2, + name=body_top_name + '_subsampled_2x') + fpn_dict[body_top_name + '_subsampled_2x'] = body_top_extension + fpn_name_list.insert(0, body_top_name + '_subsampled_2x') + spatial_scale.insert(0, spatial_scale[0] * 0.5) + # Coarser FPN levels introduced for RetinaNet + highest_backbone_level = self.min_level + len(spatial_scale) - 1 + if self.has_extra_convs and self.max_level > highest_backbone_level: + if self.use_c5: + fpn_blob = body_dict[body_name_list[0]] + else: + fpn_blob = fpn_dict[fpn_name_list[0]] + for i in range(highest_backbone_level + 1, self.max_level + 1): + fpn_blob_in = fpn_blob + fpn_name = 'fpn_' + str(i) + if i > highest_backbone_level + 1: + fpn_blob_in = fluid.layers.relu(fpn_blob) + fan = fpn_blob_in.shape[1] * 3 * 3 + fpn_blob = fluid.layers.conv2d( + input=fpn_blob_in, + num_filters=self.num_chan, + filter_size=3, + stride=2, + padding=1, + param_attr=ParamAttr( + name=fpn_name + "_w", initializer=Xavier(fan_out=fan)), + bias_attr=ParamAttr( + name=fpn_name + "_b", + learning_rate=2., + regularizer=L2Decay(0.)), + name=fpn_name) + fpn_dict[fpn_name] = fpn_blob + fpn_name_list.insert(0, fpn_name) + spatial_scale.insert(0, spatial_scale[0] * 0.5) + res_dict = OrderedDict([(k, fpn_dict[k]) for k in fpn_name_list]) + return res_dict, spatial_scale