diff --git a/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml index 8fef452e859ebf8171ff5bbc6d5e2a3d85929221..ea2937babd488b1e874f75494093d942366315e5 100644 --- a/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml +++ b/configs/cascade_rcnn/_base_/cascade_mask_rcnn_r50_fpn.yml @@ -64,7 +64,7 @@ BBoxAssigner: use_random: True CascadeTwoFCHead: - mlp_dim: 1024 + out_channel: 1024 BBoxPostProcess: decode: @@ -88,7 +88,7 @@ MaskHead: MaskFeat: num_convs: 4 - out_channels: 256 + out_channel: 256 MaskAssigner: mask_resolution: 28 diff --git a/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml index 51905687ebbd16413504667493bce81d4b211e6a..c5afe774347209812ed759e31fb03e5aff677d96 100644 --- a/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml +++ b/configs/cascade_rcnn/_base_/cascade_rcnn_r50_fpn.yml @@ -62,7 +62,7 @@ BBoxAssigner: use_random: True CascadeTwoFCHead: - mlp_dim: 1024 + out_channel: 1024 BBoxPostProcess: decode: diff --git a/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml index aa5f5b28811e263bdc5c256d8b381e810c6b7196..38ee81def0cb528f3f67e8ed616b9589bd72de9e 100644 --- a/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml +++ b/configs/faster_rcnn/_base_/faster_rcnn_r50_fpn.yml @@ -61,7 +61,7 @@ BBoxAssigner: use_random: True TwoFCHead: - mlp_dim: 1024 + out_channel: 1024 BBoxPostProcess: diff --git a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml index 1281148da634bb40862d37db7e60266d79203f40..e2c750dfbe481eb6875fff6df0febba69d0ab947 100644 --- a/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml +++ b/configs/gn/cascade_mask_rcnn_r50_fpn_gn_2x_coco.yml @@ -31,7 +31,7 @@ CascadeHead: CascadeXConvNormHead: num_convs: 4 - mlp_dim: 1024 + out_channel: 1024 norm_type: gn MaskHead: @@ -45,7 +45,7 @@ MaskHead: 
MaskFeat: num_convs: 4 - out_channels: 256 + out_channel: 256 norm_type: gn diff --git a/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml index 701b8306ec2e4c7cc854fc71569716db40c00a2e..2706790ed77301739e9d1374e9292f16a0c1c090 100644 --- a/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml +++ b/configs/gn/cascade_rcnn_r50_fpn_gn_2x_coco.yml @@ -21,7 +21,7 @@ CascadeHead: CascadeXConvNormHead: num_convs: 4 - mlp_dim: 1024 + out_channel: 1024 norm_type: gn diff --git a/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml index e8eb5679347900cbbedde7363cffb6d43ede13bd..200a98b4b9fb615c17b7bd42f88b3bb1b2474370 100644 --- a/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml +++ b/configs/gn/faster_rcnn_r50_fpn_gn_2x_coco.yml @@ -29,7 +29,7 @@ BBoxHead: XConvNormHead: num_convs: 4 - mlp_dim: 1024 + out_channel: 1024 norm_type: gn diff --git a/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml index 2104fa901e194fbbf3c66aef36ae1ffaf0f88272..70beaf5851df945745c904dc9932928d9cedac01 100644 --- a/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml +++ b/configs/gn/mask_rcnn_r50_fpn_gn_2x_coco.yml @@ -31,7 +31,7 @@ BBoxHead: XConvNormHead: num_convs: 4 - mlp_dim: 1024 + out_channel: 1024 norm_type: gn MaskHead: @@ -45,7 +45,7 @@ MaskHead: MaskFeat: num_convs: 4 - out_channels: 256 + out_channel: 256 norm_type: gn diff --git a/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml index cf6645d1e338ca303876fed5c5f480cb505eeca6..6c556f306fdc2ea5bd320376236143984f4cba6a 100644 --- a/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml +++ b/configs/hrnet/_base_/faster_rcnn_hrnetv2p_w18.yml @@ -57,7 +57,7 @@ BBoxAssigner: use_random: True TwoFCHead: - mlp_dim: 1024 + out_channel: 1024 BBoxPostProcess: decode: RCNNBox diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml index 
aa6e0db56920c15099a05f544d92ab3e3250b7a1..04dab63701171ada046b60e687422e06f8043c26 100644 --- a/configs/mask_rcnn/_base_/mask_rcnn_r50.yml +++ b/configs/mask_rcnn/_base_/mask_rcnn_r50.yml @@ -78,7 +78,7 @@ MaskHead: MaskFeat: num_convs: 0 - out_channels: 256 + out_channel: 256 MaskAssigner: mask_resolution: 14 diff --git a/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml index 74004b281096346e7127950bb6022a7cd55e90ba..dd7587669661a9e24431a167835ef89527f5e0c8 100644 --- a/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml +++ b/configs/mask_rcnn/_base_/mask_rcnn_r50_fpn.yml @@ -61,7 +61,7 @@ BBoxAssigner: use_random: True TwoFCHead: - mlp_dim: 1024 + out_channel: 1024 BBoxPostProcess: decode: RCNNBox @@ -82,7 +82,7 @@ MaskHead: MaskFeat: num_convs: 4 - out_channels: 256 + out_channel: 256 MaskAssigner: mask_resolution: 28 diff --git a/ppdet/modeling/heads/bbox_head.py b/ppdet/modeling/heads/bbox_head.py index a6480961cd1b6a4cdcbfa29b3143bf48e25eb8ec..0c75f8f1d9deb1c43818e9de44837b500112c729 100644 --- a/ppdet/modeling/heads/bbox_head.py +++ b/ppdet/modeling/heads/bbox_head.py @@ -31,31 +31,40 @@ __all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] @register class TwoFCHead(nn.Layer): - def __init__(self, in_dim=256, mlp_dim=1024, resolution=7): + """ + RCNN bbox head with Two fc layers to extract feature + + Args: + in_channel (int): Input channel which can be derived by from_config + out_channel (int): Output channel + resolution (int): Resolution of input feature map, default 7 + """ + + def __init__(self, in_channel=256, out_channel=1024, resolution=7): super(TwoFCHead, self).__init__() - self.in_dim = in_dim - self.mlp_dim = mlp_dim - fan = in_dim * resolution * resolution + self.in_channel = in_channel + self.out_channel = out_channel + fan = in_channel * resolution * resolution self.fc6 = nn.Linear( - in_dim * resolution * resolution, - mlp_dim, + in_channel * resolution * resolution, + out_channel, 
weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan))) self.fc7 = nn.Linear( - mlp_dim, - mlp_dim, + out_channel, + out_channel, weight_attr=paddle.ParamAttr(initializer=XavierUniform())) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_dim': s.channels} + return {'in_channel': s.channels} @property def out_shape(self): - return [ShapeSpec(channels=self.mlp_dim, )] + return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) @@ -68,34 +77,36 @@ class TwoFCHead(nn.Layer): @register class XConvNormHead(nn.Layer): + __shared__ = ['norm_type', 'freeze_norm'] """ RCNN bbox head with serveral convolution layers + Args: - in_dim(int): num of channels for the input rois_feat - num_convs(int): num of convolution layers for the rcnn bbox head - conv_dim(int): num of channels for the conv layers - mlp_dim(int): num of channels for the fc layers - resolution(int): resolution of the rois_feat - norm_type(str): norm type, 'gn' by defalut - freeze_norm(bool): whether to freeze the norm - stage_name(str): used in CascadeXConvNormHead, '' by default + in_channel (int): Input channels which can be derived by from_config + num_convs (int): The number of conv layers + conv_dim (int): The number of channels for the conv layers + out_channel (int): Output channels + resolution (int): Resolution of input feature map + norm_type (string): Norm type, bn, gn, sync_bn are available, + default `gn` + freeze_norm (bool): Whether to freeze the norm + stage_name (string): Prefix name for conv layer, '' by default """ - __shared__ = ['norm_type', 'freeze_norm'] def __init__(self, - in_dim=256, + in_channel=256, num_convs=4, conv_dim=256, - mlp_dim=1024, + out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, stage_name=''): super(XConvNormHead, self).__init__() - self.in_dim = in_dim + self.in_channel = 
in_channel self.num_convs = num_convs self.conv_dim = conv_dim - self.mlp_dim = mlp_dim + self.out_channel = out_channel self.norm_type = norm_type self.freeze_norm = freeze_norm @@ -103,7 +114,7 @@ class XConvNormHead(nn.Layer): fan = conv_dim * 3 * 3 initializer = KaimingNormal(fan_in=fan) for i in range(self.num_convs): - in_c = in_dim if i == 0 else conv_dim + in_c = in_channel if i == 0 else conv_dim head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) head_conv = self.add_sublayer( head_conv_name, @@ -122,7 +133,7 @@ class XConvNormHead(nn.Layer): fan = conv_dim * resolution * resolution self.fc6 = nn.Linear( conv_dim * resolution * resolution, - mlp_dim, + out_channel, weight_attr=paddle.ParamAttr( initializer=XavierUniform(fan_out=fan)), bias_attr=paddle.ParamAttr( @@ -132,11 +143,11 @@ class XConvNormHead(nn.Layer): def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_dim': s.channels} + return {'in_channel': s.channels} @property def out_shape(self): - return [ShapeSpec(channels=self.mlp_dim, )] + return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat): for i in range(self.num_convs): @@ -151,14 +162,17 @@ class BBoxHead(nn.Layer): __shared__ = ['num_classes'] __inject__ = ['bbox_assigner'] """ - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. - with_pool (bool): Whether to use pooling for the RoI feature. - num_classes (int): The number of classes - bbox_weight (List[float]): The weight to get the decode box + RCNN bbox head + + Args: + head (nn.Layer): Extract feature in bbox head + in_channel (int): Input channel after RoI extractor + roi_extractor (object): The module of RoI Extractor + bbox_assigner (object): The module of Box Assigner, label and sample the + box. 
+ with_pool (bool): Whether to use pooling for the RoI feature. + num_classes (int): The number of classes + bbox_weight (List[float]): The weight to get the decode box """ def __init__(self, diff --git a/ppdet/modeling/heads/cascade_head.py b/ppdet/modeling/heads/cascade_head.py index 99c43c83e90232de63663f872b95b88e0c81f1a5..0ee23c040ff51f418dca2f4f399b3e6618fe873e 100644 --- a/ppdet/modeling/heads/cascade_head.py +++ b/ppdet/modeling/heads/cascade_head.py @@ -32,32 +32,41 @@ __all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] @register class CascadeTwoFCHead(nn.Layer): __shared__ = ['num_cascade_stage'] + """ + Cascade RCNN bbox head with Two fc layers to extract feature + + Args: + in_channel (int): Input channel which can be derived by from_config + out_channel (int): Output channel + resolution (int): Resolution of input feature map, default 7 + num_cascade_stage (int): The number of cascade stage, default 3 + """ def __init__(self, - in_dim=256, - mlp_dim=1024, + in_channel=256, + out_channel=1024, resolution=7, num_cascade_stage=3): super(CascadeTwoFCHead, self).__init__() - self.in_dim = in_dim - self.mlp_dim = mlp_dim + self.in_channel = in_channel + self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( - str(stage), TwoFCHead(in_dim, mlp_dim, resolution)) + str(stage), TwoFCHead(in_channel, out_channel, resolution)) self.head_list.append(head_per_stage) @classmethod def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_dim': s.channels} + return {'in_channel': s.channels} @property def out_shape(self): - return [ShapeSpec(channels=self.mlp_dim, )] + return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) @@ -67,29 +76,43 @@ class CascadeTwoFCHead(nn.Layer): @register class CascadeXConvNormHead(nn.Layer): __shared__ = ['norm_type', 
'freeze_norm', 'num_cascade_stage'] + """ + Cascade RCNN bbox head with several convolution layers + + Args: + in_channel (int): Input channels which can be derived by from_config + num_convs (int): The number of conv layers + conv_dim (int): The number of channels for the conv layers + out_channel (int): Output channels + resolution (int): Resolution of input feature map + norm_type (string): Norm type, bn, gn, sync_bn are available, + default `gn` + freeze_norm (bool): Whether to freeze the norm + num_cascade_stage (int): The number of cascade stage, default 3 + """ def __init__(self, - in_dim=256, + in_channel=256, num_convs=4, conv_dim=256, - mlp_dim=1024, + out_channel=1024, resolution=7, norm_type='gn', freeze_norm=False, num_cascade_stage=3): super(CascadeXConvNormHead, self).__init__() - self.in_dim = in_dim - self.mlp_dim = mlp_dim + self.in_channel = in_channel + self.out_channel = out_channel self.head_list = [] for stage in range(num_cascade_stage): head_per_stage = self.add_sublayer( str(stage), XConvNormHead( - in_dim, + in_channel, num_convs, conv_dim, - mlp_dim, + out_channel, resolution, norm_type, freeze_norm, @@ -100,11 +123,11 @@ class CascadeXConvNormHead(nn.Layer): def from_config(cls, cfg, input_shape): s = input_shape s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_dim': s.channels} + return {'in_channel': s.channels} @property def out_shape(self): - return [ShapeSpec(channels=self.mlp_dim, )] + return [ShapeSpec(channels=self.out_channel, )] def forward(self, rois_feat, stage=0): out = self.head_list[stage](rois_feat) @@ -116,16 +139,18 @@ class CascadeHead(BBoxHead): __shared__ = ['num_classes', 'num_cascade_stages'] __inject__ = ['bbox_assigner'] """ - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. 
- num_classes (int): The number of classes - bbox_weight (List[List[float]]): The weight to get the decode box and the - length of weight is the number of cascade - stage - num_cascade_stages (int): THe number of stage to refine the box + Cascade RCNN bbox head + + Args: + head (nn.Layer): Extract feature in bbox head + in_channel (int): Input channel after RoI extractor + roi_extractor (object): The module of RoI Extractor + bbox_assigner (object): The module of Box Assigner, label and sample the + box. + num_classes (int): The number of classes + bbox_weight (List[List[float]]): The weight to get the decode box and the + length of weight is the number of cascade stage + num_cascade_stages (int): The number of stages to refine the box """ def __init__(self, diff --git a/ppdet/modeling/heads/mask_head.py b/ppdet/modeling/heads/mask_head.py index dc624ff838e8b9dcb66e024fcbf83fcdbb08cf4a..eea70922a483e16cc379e394235b396307391e4c 100644 --- a/ppdet/modeling/heads/mask_head.py +++ b/ppdet/modeling/heads/mask_head.py @@ -27,18 +27,29 @@ from .roi_extractor import RoIAlign @register class MaskFeat(nn.Layer): + """ + Feature extraction in Mask head + + Args: + in_channel (int): Input channels + out_channel (int): Output channels + num_convs (int): The number of conv layers, default 4 + norm_type (string | None): Norm type, bn, gn, sync_bn are available, + default None + """ + def __init__(self, + in_channel=256, + out_channel=256, num_convs=4, - in_channels=256, - out_channels=256, norm_type=None): super(MaskFeat, self).__init__() self.num_convs = num_convs - self.in_channels = in_channels - self.out_channels = out_channels + self.in_channel = in_channel + self.out_channel = out_channel self.norm_type = norm_type - fan_conv = out_channels * 3 * 3 - fan_deconv = out_channels * 2 * 2 + fan_conv = out_channel * 3 * 3 + fan_deconv = out_channel * 2 * 2 mask_conv = nn.Sequential() if norm_type == 'gn': @@ -47,8 +58,8 @@ class MaskFeat(nn.Layer): mask_conv.add_sublayer( 
conv_name, ConvNormLayer( - ch_in=in_channels if i == 0 else out_channels, - ch_out=out_channels, + ch_in=in_channel if i == 0 else out_channel, + ch_out=out_channel, filter_size=3, stride=1, norm_type=self.norm_type, @@ -62,8 +73,8 @@ class MaskFeat(nn.Layer): mask_conv.add_sublayer( conv_name, nn.Conv2D( - in_channels=in_channels if i == 0 else out_channels, - out_channels=out_channels, + in_channels=in_channel if i == 0 else out_channel, + out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr( @@ -72,8 +83,8 @@ class MaskFeat(nn.Layer): mask_conv.add_sublayer( 'conv5_mask', nn.Conv2DTranspose( - in_channels=self.in_channels, - out_channels=self.out_channels, + in_channels=self.in_channel, + out_channels=self.out_channel, kernel_size=2, stride=2, weight_attr=paddle.ParamAttr( @@ -85,10 +96,10 @@ class MaskFeat(nn.Layer): def from_config(cls, cfg, input_shape): if isinstance(input_shape, (list, tuple)): input_shape = input_shape[0] - return {'in_channels': input_shape.channels, } + return {'in_channel': input_shape.channels, } - def out_channel(self): - return self.out_channels + def out_channels(self): + return self.out_channel def forward(self, feats): return self.upsample(feats) @@ -98,6 +109,18 @@ class MaskFeat(nn.Layer): class MaskHead(nn.Layer): __shared__ = ['num_classes'] __inject__ = ['mask_assigner'] + """ + RCNN mask head + + Args: + head (nn.Layer): Extract feature in mask head + roi_extractor (object): The module of RoI Extractor + mask_assigner (object): The module of Mask Assigner, + label and sample the mask + num_classes (int): The number of classes + share_bbox_feat (bool): Whether to share the feature from bbox head, + default false + """ def __init__(self, head, @@ -112,7 +135,7 @@ class MaskHead(nn.Layer): if isinstance(roi_extractor, dict): self.roi_extractor = RoIAlign(**roi_extractor) self.head = head - self.in_channels = head.out_channel() + self.in_channels = head.out_channels() self.mask_assigner = 
mask_assigner self.share_bbox_feat = share_bbox_feat self.bbox_head = None @@ -159,7 +182,6 @@ class MaskHead(nn.Layer): rois_num (Tensor): The number of proposals for each batch inputs (dict): ground truth info """ - #assert self.bbox_head tgt_labels, _, tgt_gt_inds = targets rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( rois, tgt_labels, tgt_gt_inds, inputs) diff --git a/ppdet/modeling/heads/roi_extractor.py b/ppdet/modeling/heads/roi_extractor.py index 1e2f658a7b1a67a9b86a3ca881177bc74ecde737..35c3924e36c60ddbc82f38f6b828197e31833b01 100644 --- a/ppdet/modeling/heads/roi_extractor.py +++ b/ppdet/modeling/heads/roi_extractor.py @@ -25,6 +25,31 @@ def _to_list(v): @register class RoIAlign(object): + """ + RoI Align module + + For more details, please refer to the document of roi_align + in ppdet/modeling/ops.py + + Args: + resolution (int): The output size, default 14 + spatial_scale (float): Multiplicative spatial scale factor to translate + ROI coords from their input scale to the scale used when pooling. + default 0.0625 + sampling_ratio (int): The number of sampling points in the interpolation + grid, default 0 + canconical_level (int): The referring level of FPN layer with + specified level. default 4 + canonical_size (int): The referring scale of FPN layer with + specified scale. default 224 + start_level (int): The start level of FPN layer to extract RoI feature, + default 0 + end_level (int): The end level of FPN layer to extract RoI feature, + default 3 + aligned (bool): Whether to add offset to rois' coord in roi_align. 
+ default false + """ + def __init__(self, resolution=14, spatial_scale=0.0625, diff --git a/ppdet/modeling/heads/rpn_head.py b/ppdet/modeling/heads/rpn_head.py deleted file mode 100644 index 64f7acc495326d4edbbff389e5351f602e67f0de..0000000000000000000000000000000000000000 --- a/ppdet/modeling/heads/rpn_head.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal -from paddle.regularizer import L2Decay -from paddle.nn import Conv2D - -from ppdet.core.workspace import register -from ppdet.modeling import ops - - -@register -class RPNFeat(nn.Layer): - def __init__(self, feat_in=1024, feat_out=1024): - super(RPNFeat, self).__init__() - # rpn feat is shared with each level - self.rpn_conv = Conv2D( - in_channels=feat_in, - out_channels=feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.))) - - def forward(self, inputs, feats): - rpn_feats = [] - for feat in feats: - rpn_feats.append(F.relu(self.rpn_conv(feat))) - return rpn_feats - - -@register -class RPNHead(nn.Layer): - __inject__ = ['rpn_feat'] - - def __init__(self, rpn_feat, anchor_per_position=15, rpn_channel=1024): - super(RPNHead, self).__init__() - self.rpn_feat = rpn_feat - if isinstance(rpn_feat, dict): - self.rpn_feat = RPNFeat(**rpn_feat) - # rpn head is shared with each level - # rpn roi classification scores - self.rpn_rois_score = Conv2D( - in_channels=rpn_channel, - out_channels=anchor_per_position, - kernel_size=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - 
learning_rate=2., regularizer=L2Decay(0.))) - - # rpn roi bbox regression deltas - self.rpn_rois_delta = Conv2D( - in_channels=rpn_channel, - out_channels=4 * anchor_per_position, - kernel_size=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.))) - - def forward(self, inputs, feats): - rpn_feats = self.rpn_feat(inputs, feats) - rpn_head_out = [] - for rpn_feat in rpn_feats: - rrs = self.rpn_rois_score(rpn_feat) - rrd = self.rpn_rois_delta(rpn_feat) - rpn_head_out.append((rrs, rrd)) - return rpn_feats, rpn_head_out - - def get_loss(self, loss_inputs): - # cls loss - score_tgt = paddle.cast( - x=loss_inputs['rpn_score_target'], dtype='float32') - score_tgt.stop_gradient = True - loss_rpn_cls = ops.sigmoid_cross_entropy_with_logits( - input=loss_inputs['rpn_score_pred'], label=score_tgt) - loss_rpn_cls = paddle.mean(loss_rpn_cls, name='loss_rpn_cls') - - # reg loss - loc_tgt = paddle.cast(x=loss_inputs['rpn_rois_target'], dtype='float32') - loc_tgt.stop_gradient = True - loss_rpn_reg = ops.smooth_l1( - input=loss_inputs['rpn_rois_pred'], - label=loc_tgt, - inside_weight=loss_inputs['rpn_rois_weight'], - outside_weight=loss_inputs['rpn_rois_weight'], - sigma=3.0, ) - loss_rpn_reg = paddle.sum(loss_rpn_reg) - score_shape = paddle.shape(score_tgt) - score_shape = paddle.cast(score_shape, dtype='float32') - norm = paddle.prod(score_shape) - norm.stop_gradient = True - loss_rpn_reg = loss_rpn_reg / norm - - return {'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_reg': loss_rpn_reg} diff --git a/ppdet/modeling/necks/fpn.py b/ppdet/modeling/necks/fpn.py index 85767bb105dd4d134b339e37b9f759016ce9f369..0b9f6a798bdc0d87630d96135f86cc8dc2802506 100644 --- a/ppdet/modeling/necks/fpn.py +++ b/ppdet/modeling/necks/fpn.py @@ -29,6 +29,34 @@ __all__ = ['FPN'] @register @serializable class FPN(nn.Layer): + """ + Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 + + Args: + 
in_channels (list[int]): input channels of each level which can be + derived from the output shape of backbone by from_config + out_channel (int): output channel of each level + spatial_scales (list[float]): the spatial scales between input feature + maps and original input image which can be derived from the output + shape of backbone by from_config + has_extra_convs (bool): whether to add extra conv to the last level. + default False + extra_stage (int): the number of extra stages added to the last level. + default 1 + use_c5 (bool): Whether to use c5 as the input of extra stage, + otherwise p5 is used. default True + norm_type (string|None): The normalization type in FPN module. If + norm_type is None, norm will not be used after conv and if + norm_type is string, bn, gn, sync_bn are available. default None + norm_decay (float): weight decay for normalization layer weights. + default 0. + freeze_norm (bool): whether to freeze normalization layer. + default False + relu_before_extra_convs (bool): whether to add relu before extra convs. 
+ default False + + """ + def __init__(self, in_channels, out_channel, @@ -67,7 +95,7 @@ class FPN(nn.Layer): else: lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) in_c = in_channels[i - st_stage] - if self.norm_type == 'gn': + if self.norm_type is not None: lateral = self.add_sublayer( lateral_name, ConvNormLayer( @@ -93,7 +121,7 @@ class FPN(nn.Layer): self.lateral_convs.append(lateral) fpn_name = 'fpn_res{}_sum'.format(i + 2) - if self.norm_type == 'gn': + if self.norm_type is not None: fpn_conv = self.add_sublayer( fpn_name, ConvNormLayer( @@ -128,7 +156,7 @@ class FPN(nn.Layer): else: in_c = out_channel extra_fpn_name = 'fpn_{}'.format(lvl + 2) - if self.norm_type == 'gn': + if self.norm_type is not None: extra_fpn_conv = self.add_sublayer( extra_fpn_name, ConvNormLayer( diff --git a/ppdet/modeling/proposal_generator/anchor_generator.py b/ppdet/modeling/proposal_generator/anchor_generator.py index 1ca0319d3ad13d3650022d8d958c0f92954914c9..8088ffa04affa3c2ecd81348d9eb7cb749f7b4f8 100644 --- a/ppdet/modeling/proposal_generator/anchor_generator.py +++ b/ppdet/modeling/proposal_generator/anchor_generator.py @@ -25,6 +25,24 @@ from .. import ops @register class AnchorGenerator(nn.Layer): + """ + Generate anchors according to the feature maps + + Args: + anchor_sizes (list[float] | list[list[float]]): The anchor sizes at + each feature point. list[float] means all feature levels share the + same sizes. list[list[float]] means the anchor sizes for + each level. The sizes stand for the scale of input size. + aspect_ratios (list[float] | list[list[float]]): The aspect ratios at + each feature point. list[float] means all feature levels share the + same ratios. list[list[float]] means the aspect ratios for + each level. + strides (list[float]): The strides of feature maps which generate + anchors + offset (float): The offset of the coordinate of anchors, default 0. 
+ + """ + def __init__(self, anchor_sizes=[32, 64, 128, 256, 512], aspect_ratios=[0.5, 1.0, 2.0], diff --git a/ppdet/modeling/proposal_generator/proposal_generator.py b/ppdet/modeling/proposal_generator/proposal_generator.py index 8a5df53255d080ec83d083bd0db72b41ca8700b4..12518e48817233e705638163d59e9fe9a9986938 100644 --- a/ppdet/modeling/proposal_generator/proposal_generator.py +++ b/ppdet/modeling/proposal_generator/proposal_generator.py @@ -25,6 +25,28 @@ from .. import ops @register @serializable class ProposalGenerator(object): + """ + Proposal generation module + + For more details, please refer to the document of generate_proposals + in ppdet/modeling/ops.py + + Args: + pre_nms_top_n (int): Number of total bboxes to be kept per + image before NMS. default 12000 + post_nms_top_n (int): Number of total bboxes to be kept per + image after NMS. default 2000 + nms_thresh (float): Threshold in NMS. default 0.5 + min_size (float): Remove predicted boxes with either height or + width < min_size. default 0.1 + eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + default 1. + topk_after_collect (bool): whether to adopt topk after batch + collection. If topk_after_collect is true, box filter will not be + used after NMS at each image in proposal generation. 
default false + """ + def __init__(self, pre_nms_top_n=12000, post_nms_top_n=2000, diff --git a/ppdet/modeling/proposal_generator/rpn_head.py b/ppdet/modeling/proposal_generator/rpn_head.py index 6a1c980a452f0390b6e7355210cfc190ceab184a..2b1e6c77b7cb30511794d1e2c283cf81759c2857 100644 --- a/ppdet/modeling/proposal_generator/rpn_head.py +++ b/ppdet/modeling/proposal_generator/rpn_head.py @@ -27,12 +27,20 @@ from .proposal_generator import ProposalGenerator class RPNFeat(nn.Layer): - def __init__(self, feat_in=1024, feat_out=1024): + """ + Feature extraction in RPN head + + Args: + in_channel (int): Input channel + out_channel (int): Output channel + """ + + def __init__(self, in_channel=1024, out_channel=1024): super(RPNFeat, self).__init__() # rpn feat is shared with each level self.rpn_conv = nn.Conv2D( - in_channels=feat_in, - out_channels=feat_out, + in_channels=in_channel, + out_channels=out_channel, kernel_size=3, padding=1, weight_attr=paddle.ParamAttr(initializer=Normal( @@ -47,6 +55,20 @@ class RPNFeat(nn.Layer): @register class RPNHead(nn.Layer): + """ + Region Proposal Network + + Args: + anchor_generator (dict): configure of anchor generation + rpn_target_assign (dict): configure of rpn targets assignment + train_proposal (dict): configure of proposals generation + at the stage of training + test_proposal (dict): configure of proposals generation + at the stage of prediction + in_channel (int): channel of input feature maps which can be + derived by from_config + """ + def __init__(self, anchor_generator=AnchorGenerator().__dict__, rpn_target_assign=RPNTargetAssign().__dict__, diff --git a/ppdet/modeling/proposal_generator/target.py b/ppdet/modeling/proposal_generator/target.py index b4d490a52c4b525106b649c28c03c23f27ee7910..a783bbdb58586863dd4529400e1d67d94a128bd4 100644 --- a/ppdet/modeling/proposal_generator/target.py +++ b/ppdet/modeling/proposal_generator/target.py @@ -135,12 +135,15 @@ def generate_proposal_target(rpn_rois, tgt_gt_inds = [] 
new_rois_num = [] + # In cascade rcnn, the threshold for foreground and background + # is taken from cascade_iou fg_thresh = cascade_iou if is_cascade else fg_thresh bg_thresh = cascade_iou if is_cascade else bg_thresh for i, rpn_roi in enumerate(rpn_rois): gt_bbox = gt_boxes[i] gt_class = paddle.squeeze(gt_classes[i], axis=-1) + # Concat RoIs and gt boxes except in cascade rcnn if not is_cascade: bbox = paddle.concat([rpn_roi, gt_bbox]) else: @@ -247,10 +250,12 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, tgt_weights = [] for k in range(len(rois)): labels_per_im = labels_int32[k] + # select rois labeled with foreground fg_inds = paddle.nonzero( paddle.logical_and(labels_per_im != -1, labels_per_im != num_classes)) has_fg = True + # generate fake roi if foreground is empty if fg_inds.numel() == 0: has_fg = False fg_inds = paddle.ones([1], dtype='int32') @@ -259,6 +264,8 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, rois_per_im = rois[k] fg_rois = paddle.gather(rois_per_im, fg_inds) + # Copy the foreground roi to cpu + # to generate mask target with ground-truth boxes = fg_rois.numpy() gt_segms_per_im = gt_segms[k] new_segm = [] diff --git a/ppdet/modeling/proposal_generator/target_layer.py b/ppdet/modeling/proposal_generator/target_layer.py index 1087638b9300b3252dac16549e6a26f97349f700..6ad82dad156a7dba23b6a8638d9f99077a8b4010 100644 --- a/ppdet/modeling/proposal_generator/target_layer.py +++ b/ppdet/modeling/proposal_generator/target_layer.py @@ -22,6 +22,32 @@ from .target import rpn_anchor_target, generate_proposal_target, generate_mask_t @register @serializable class RPNTargetAssign(object): + """ + RPN targets assignment module + + The assignment consists of three steps: + 1. Match anchor and ground-truth box, label the anchor with foreground + or background sample + 2. Sample anchors to keep the proper ratio between foreground and + background + 3. 
Generate the targets for classification and regression branch + + + Args: + batch_size_per_im (int): Total number of RPN samples per image. + default 256 + fg_fraction (float): Fraction of anchors that is labeled + foreground, default 0.5 + positive_overlap (float): Minimum overlap required between an anchor + and ground-truth box for the (anchor, gt box) pair to be + a foreground sample. default 0.7 + negative_overlap (float): Maximum overlap allowed between an anchor + and ground-truth box for the (anchor, gt box) pair to be + a background sample. default 0.3 + use_random (bool): Use random sampling to choose foreground and + background boxes, default true. + """ + def __init__(self, batch_size_per_im=256, fg_fraction=0.5, @@ -54,6 +80,33 @@ class RPNTargetAssign(object): @register class BBoxAssigner(object): __shared__ = ['num_classes'] + """ + RCNN targets assignment module + + The assignment consists of three steps: + 1. Match RoIs and ground-truth box, label the RoIs with foreground + or background sample + 2. Sample RoIs to keep the proper ratio between foreground and + background + 3. Generate the targets for classification and regression branch + + Args: + batch_size_per_im (int): Total number of RoIs per image. + default 512 + fg_fraction (float): Fraction of RoIs that is labeled + foreground, default 0.25 + fg_thresh (float): Minimum overlap required between a RoI + and ground-truth box for the (roi, gt box) pair to be + a foreground sample. default 0.5 + bg_thresh (float): Maximum overlap allowed between a RoI + and ground-truth box for the (roi, gt box) pair to be + a background sample. default 0.5 + use_random (bool): Use random sampling to choose foreground and + background boxes, default true + cascade_iou (list[float]): The list of overlap to select foreground and + background of each stage, which is only used in Cascade RCNN. + num_classes (int): The number of classes. 
+ """ def __init__(self, batch_size_per_im=512, @@ -61,7 +114,6 @@ class BBoxAssigner(object): fg_thresh=.5, bg_thresh=.5, use_random=True, - is_cls_agnostic=False, cascade_iou=[0.5, 0.6, 0.7], num_classes=80): super(BBoxAssigner, self).__init__() @@ -70,7 +122,6 @@ class BBoxAssigner(object): self.fg_thresh = fg_thresh self.bg_thresh = bg_thresh self.use_random = use_random - self.is_cls_agnostic = is_cls_agnostic self.cascade_iou = cascade_iou self.num_classes = num_classes @@ -99,6 +150,18 @@ class BBoxAssigner(object): @serializable class MaskAssigner(object): __shared__ = ['num_classes', 'mask_resolution'] + """ + Mask targets assignment module + + The assignment consists of two steps: + 1. Select RoIs labeled as foreground. + 2. Encode the RoIs and corresponding gt polygons to generate + mask target + + Args: + num_classes (int): The number of classes + mask_resolution (int): The resolution of mask target, default 14 + """ def __init__(self, num_classes=80, mask_resolution=14): super(MaskAssigner, self).__init__()