Unverified commit 988574fe, authored by wangguanzhong, committed by GitHub

add comments for rcnn (#2461)

Parent 10d13c43
......@@ -64,7 +64,7 @@ BBoxAssigner:
use_random: True
CascadeTwoFCHead:
mlp_dim: 1024
out_channel: 1024
BBoxPostProcess:
decode:
......@@ -88,7 +88,7 @@ MaskHead:
MaskFeat:
num_convs: 4
out_channels: 256
out_channel: 256
MaskAssigner:
mask_resolution: 28
......
......@@ -62,7 +62,7 @@ BBoxAssigner:
use_random: True
CascadeTwoFCHead:
mlp_dim: 1024
out_channel: 1024
BBoxPostProcess:
decode:
......
......@@ -61,7 +61,7 @@ BBoxAssigner:
use_random: True
TwoFCHead:
mlp_dim: 1024
out_channel: 1024
BBoxPostProcess:
......
......@@ -31,7 +31,7 @@ CascadeHead:
CascadeXConvNormHead:
num_convs: 4
mlp_dim: 1024
out_channel: 1024
norm_type: gn
MaskHead:
......@@ -45,7 +45,7 @@ MaskHead:
MaskFeat:
num_convs: 4
out_channels: 256
out_channel: 256
norm_type: gn
......
......@@ -21,7 +21,7 @@ CascadeHead:
CascadeXConvNormHead:
num_convs: 4
mlp_dim: 1024
out_channel: 1024
norm_type: gn
......
......@@ -29,7 +29,7 @@ BBoxHead:
XConvNormHead:
num_convs: 4
mlp_dim: 1024
out_channel: 1024
norm_type: gn
......
......@@ -31,7 +31,7 @@ BBoxHead:
XConvNormHead:
num_convs: 4
mlp_dim: 1024
out_channel: 1024
norm_type: gn
MaskHead:
......@@ -45,7 +45,7 @@ MaskHead:
MaskFeat:
num_convs: 4
out_channels: 256
out_channel: 256
norm_type: gn
......
......@@ -57,7 +57,7 @@ BBoxAssigner:
use_random: True
TwoFCHead:
mlp_dim: 1024
out_channel: 1024
BBoxPostProcess:
decode: RCNNBox
......
......@@ -78,7 +78,7 @@ MaskHead:
MaskFeat:
num_convs: 0
out_channels: 256
out_channel: 256
MaskAssigner:
mask_resolution: 14
......
......@@ -61,7 +61,7 @@ BBoxAssigner:
use_random: True
TwoFCHead:
mlp_dim: 1024
out_channel: 1024
BBoxPostProcess:
decode: RCNNBox
......@@ -82,7 +82,7 @@ MaskHead:
MaskFeat:
num_convs: 4
out_channels: 256
out_channel: 256
MaskAssigner:
mask_resolution: 28
......
......@@ -31,31 +31,40 @@ __all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead']
@register
class TwoFCHead(nn.Layer):
def __init__(self, in_dim=256, mlp_dim=1024, resolution=7):
"""
RCNN bbox head with Two fc layers to extract feature
Args:
in_channel (int): Input channel which can be derived by from_config
out_channel (int): Output channel
resolution (int): Resolution of input feature map, default 7
"""
def __init__(self, in_channel=256, out_channel=1024, resolution=7):
super(TwoFCHead, self).__init__()
self.in_dim = in_dim
self.mlp_dim = mlp_dim
fan = in_dim * resolution * resolution
self.in_channel = in_channel
self.out_channel = out_channel
fan = in_channel * resolution * resolution
self.fc6 = nn.Linear(
in_dim * resolution * resolution,
mlp_dim,
in_channel * resolution * resolution,
out_channel,
weight_attr=paddle.ParamAttr(
initializer=XavierUniform(fan_out=fan)))
self.fc7 = nn.Linear(
mlp_dim,
mlp_dim,
out_channel,
out_channel,
weight_attr=paddle.ParamAttr(initializer=XavierUniform()))
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_dim': s.channels}
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.mlp_dim, )]
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat):
rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1)
......@@ -68,34 +77,36 @@ class TwoFCHead(nn.Layer):
@register
class XConvNormHead(nn.Layer):
__shared__ = ['norm_type', 'freeze_norm']
"""
RCNN bbox head with several convolution layers
Args:
in_dim(int): num of channels for the input rois_feat
num_convs(int): num of convolution layers for the rcnn bbox head
conv_dim(int): num of channels for the conv layers
mlp_dim(int): num of channels for the fc layers
resolution(int): resolution of the rois_feat
norm_type(str): norm type, 'gn' by defalut
freeze_norm(bool): whether to freeze the norm
stage_name(str): used in CascadeXConvNormHead, '' by default
in_channel (int): Input channels which can be derived by from_config
num_convs (int): The number of conv layers
conv_dim (int): The number of channels for the conv layers
out_channel (int): Output channels
resolution (int): Resolution of input feature map
norm_type (string): Norm type, bn, gn, sync_bn are available,
default `gn`
freeze_norm (bool): Whether to freeze the norm
stage_name (string): Prefix name for conv layer, '' by default
"""
__shared__ = ['norm_type', 'freeze_norm']
def __init__(self,
in_dim=256,
in_channel=256,
num_convs=4,
conv_dim=256,
mlp_dim=1024,
out_channel=1024,
resolution=7,
norm_type='gn',
freeze_norm=False,
stage_name=''):
super(XConvNormHead, self).__init__()
self.in_dim = in_dim
self.in_channel = in_channel
self.num_convs = num_convs
self.conv_dim = conv_dim
self.mlp_dim = mlp_dim
self.out_channel = out_channel
self.norm_type = norm_type
self.freeze_norm = freeze_norm
......@@ -103,7 +114,7 @@ class XConvNormHead(nn.Layer):
fan = conv_dim * 3 * 3
initializer = KaimingNormal(fan_in=fan)
for i in range(self.num_convs):
in_c = in_dim if i == 0 else conv_dim
in_c = in_channel if i == 0 else conv_dim
head_conv_name = stage_name + 'bbox_head_conv{}'.format(i)
head_conv = self.add_sublayer(
head_conv_name,
......@@ -122,7 +133,7 @@ class XConvNormHead(nn.Layer):
fan = conv_dim * resolution * resolution
self.fc6 = nn.Linear(
conv_dim * resolution * resolution,
mlp_dim,
out_channel,
weight_attr=paddle.ParamAttr(
initializer=XavierUniform(fan_out=fan)),
bias_attr=paddle.ParamAttr(
......@@ -132,11 +143,11 @@ class XConvNormHead(nn.Layer):
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_dim': s.channels}
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.mlp_dim, )]
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat):
for i in range(self.num_convs):
......@@ -151,6 +162,9 @@ class BBoxHead(nn.Layer):
__shared__ = ['num_classes']
__inject__ = ['bbox_assigner']
"""
RCNN bbox head
Args:
head (nn.Layer): Extract feature in bbox head
in_channel (int): Input channel after RoI extractor
roi_extractor (object): The module of RoI Extractor
......
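For reference, a minimal usage sketch of the renamed bbox-head arguments (in_dim → in_channel, mlp_dim → out_channel). The import path ppdet.modeling.heads.bbox_head is an assumption about where these classes live in this revision, not something shown in the diff:
# Sketch only: import path and installed PaddleDetection/Paddle are assumptions.
import paddle
from ppdet.modeling.heads.bbox_head import TwoFCHead, XConvNormHead
# in_channel/out_channel replace the former in_dim/mlp_dim arguments.
fc_head = TwoFCHead(in_channel=256, out_channel=1024, resolution=7)
conv_head = XConvNormHead(in_channel=256, num_convs=4, conv_dim=256,
                          out_channel=1024, resolution=7, norm_type='gn')
rois_feat = paddle.rand([8, 256, 7, 7])   # 8 RoIs pooled to 7x7 with 256 channels
out = fc_head(rois_feat)                  # two fc layers -> shape [8, 1024]
print(out.shape, fc_head.out_shape[0].channels)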
......@@ -32,32 +32,41 @@ __all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead']
@register
class CascadeTwoFCHead(nn.Layer):
__shared__ = ['num_cascade_stage']
"""
Cascade RCNN bbox head with Two fc layers to extract feature
Args:
in_channel (int): Input channel which can be derived by from_config
out_channel (int): Output channel
resolution (int): Resolution of input feature map, default 7
num_cascade_stage (int): The number of cascade stage, default 3
"""
def __init__(self,
in_dim=256,
mlp_dim=1024,
in_channel=256,
out_channel=1024,
resolution=7,
num_cascade_stage=3):
super(CascadeTwoFCHead, self).__init__()
self.in_dim = in_dim
self.mlp_dim = mlp_dim
self.in_channel = in_channel
self.out_channel = out_channel
self.head_list = []
for stage in range(num_cascade_stage):
head_per_stage = self.add_sublayer(
str(stage), TwoFCHead(in_dim, mlp_dim, resolution))
str(stage), TwoFCHead(in_channel, out_channel, resolution))
self.head_list.append(head_per_stage)
@classmethod
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_dim': s.channels}
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.mlp_dim, )]
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat, stage=0):
out = self.head_list[stage](rois_feat)
......@@ -67,29 +76,43 @@ class CascadeTwoFCHead(nn.Layer):
@register
class CascadeXConvNormHead(nn.Layer):
__shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage']
"""
Cascade RCNN bbox head with several convolution layers
Args:
in_channel (int): Input channels which can be derived by from_config
num_convs (int): The number of conv layers
conv_dim (int): The number of channels for the conv layers
out_channel (int): Output channels
resolution (int): Resolution of input feature map
norm_type (string): Norm type, bn, gn, sync_bn are available,
default `gn`
freeze_norm (bool): Whether to freeze the norm
num_cascade_stage (int): The number of cascade stage, default 3
"""
def __init__(self,
in_dim=256,
in_channel=256,
num_convs=4,
conv_dim=256,
mlp_dim=1024,
out_channel=1024,
resolution=7,
norm_type='gn',
freeze_norm=False,
num_cascade_stage=3):
super(CascadeXConvNormHead, self).__init__()
self.in_dim = in_dim
self.mlp_dim = mlp_dim
self.in_channel = in_channel
self.out_channel = out_channel
self.head_list = []
for stage in range(num_cascade_stage):
head_per_stage = self.add_sublayer(
str(stage),
XConvNormHead(
in_dim,
in_channel,
num_convs,
conv_dim,
mlp_dim,
out_channel,
resolution,
norm_type,
freeze_norm,
......@@ -100,11 +123,11 @@ class CascadeXConvNormHead(nn.Layer):
def from_config(cls, cfg, input_shape):
s = input_shape
s = s[0] if isinstance(s, (list, tuple)) else s
return {'in_dim': s.channels}
return {'in_channel': s.channels}
@property
def out_shape(self):
return [ShapeSpec(channels=self.mlp_dim, )]
return [ShapeSpec(channels=self.out_channel, )]
def forward(self, rois_feat, stage=0):
out = self.head_list[stage](rois_feat)
......@@ -116,6 +139,9 @@ class CascadeHead(BBoxHead):
__shared__ = ['num_classes', 'num_cascade_stages']
__inject__ = ['bbox_assigner']
"""
Cascade RCNN bbox head
Args:
head (nn.Layer): Extract feature in bbox head
in_channel (int): Input channel after RoI extractor
roi_extractor (object): The module of RoI Extractor
......@@ -123,8 +149,7 @@ class CascadeHead(BBoxHead):
box.
num_classes (int): The number of classes
bbox_weight (List[List[float]]): The weight to get the decode box and the
length of weight is the number of cascade
stage
length of weight is the number of cascade stage
num_cascade_stages (int): The number of stages to refine the box
"""
......
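A similar sketch for the cascade heads, assuming an import path of ppdet.modeling.heads.cascade_head (not shown in this diff):
# Sketch only: import path is an assumption.
import paddle
from ppdet.modeling.heads.cascade_head import CascadeTwoFCHead
# One TwoFCHead per cascade stage; out_channel replaces the former mlp_dim.
head = CascadeTwoFCHead(in_channel=256, out_channel=1024,
                        resolution=7, num_cascade_stage=3)
rois_feat = paddle.rand([4, 256, 7, 7])
feat0 = head(rois_feat, stage=0)   # [4, 1024], stage-0 weights
feat2 = head(rois_feat, stage=2)   # same shape, separate stage-2 weights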
......@@ -27,18 +27,29 @@ from .roi_extractor import RoIAlign
@register
class MaskFeat(nn.Layer):
"""
Feature extraction in Mask head
Args:
in_channel (int): Input channels
out_channel (int): Output channels
num_convs (int): The number of conv layers, default 4
norm_type (string | None): Norm type, bn, gn, sync_bn are available,
default None
"""
def __init__(self,
in_channel=256,
out_channel=256,
num_convs=4,
in_channels=256,
out_channels=256,
norm_type=None):
super(MaskFeat, self).__init__()
self.num_convs = num_convs
self.in_channels = in_channels
self.out_channels = out_channels
self.in_channel = in_channel
self.out_channel = out_channel
self.norm_type = norm_type
fan_conv = out_channels * 3 * 3
fan_deconv = out_channels * 2 * 2
fan_conv = out_channel * 3 * 3
fan_deconv = out_channel * 2 * 2
mask_conv = nn.Sequential()
if norm_type == 'gn':
......@@ -47,8 +58,8 @@ class MaskFeat(nn.Layer):
mask_conv.add_sublayer(
conv_name,
ConvNormLayer(
ch_in=in_channels if i == 0 else out_channels,
ch_out=out_channels,
ch_in=in_channel if i == 0 else out_channel,
ch_out=out_channel,
filter_size=3,
stride=1,
norm_type=self.norm_type,
......@@ -62,8 +73,8 @@ class MaskFeat(nn.Layer):
mask_conv.add_sublayer(
conv_name,
nn.Conv2D(
in_channels=in_channels if i == 0 else out_channels,
out_channels=out_channels,
in_channels=in_channel if i == 0 else out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
weight_attr=paddle.ParamAttr(
......@@ -72,8 +83,8 @@ class MaskFeat(nn.Layer):
mask_conv.add_sublayer(
'conv5_mask',
nn.Conv2DTranspose(
in_channels=self.in_channels,
out_channels=self.out_channels,
in_channels=self.in_channel,
out_channels=self.out_channel,
kernel_size=2,
stride=2,
weight_attr=paddle.ParamAttr(
......@@ -85,10 +96,10 @@ class MaskFeat(nn.Layer):
def from_config(cls, cfg, input_shape):
if isinstance(input_shape, (list, tuple)):
input_shape = input_shape[0]
return {'in_channels': input_shape.channels, }
return {'in_channel': input_shape.channels, }
def out_channel(self):
return self.out_channels
def out_channels(self):
return self.out_channel
def forward(self, feats):
return self.upsample(feats)
......@@ -98,6 +109,18 @@ class MaskFeat(nn.Layer):
class MaskHead(nn.Layer):
__shared__ = ['num_classes']
__inject__ = ['mask_assigner']
"""
RCNN mask head
Args:
head (nn.Layer): Extract feature in mask head
roi_extractor (object): The module of RoI Extractor
mask_assigner (object): The module of Mask Assigner,
label and sample the mask
num_classes (int): The number of classes
share_bbox_feat (bool): Whether to share the feature from bbox head,
default false
"""
def __init__(self,
head,
......@@ -112,7 +135,7 @@ class MaskHead(nn.Layer):
if isinstance(roi_extractor, dict):
self.roi_extractor = RoIAlign(**roi_extractor)
self.head = head
self.in_channels = head.out_channel()
self.in_channels = head.out_channels()
self.mask_assigner = mask_assigner
self.share_bbox_feat = share_bbox_feat
self.bbox_head = None
......@@ -159,7 +182,6 @@ class MaskHead(nn.Layer):
rois_num (Tensor): The number of proposals for each batch
inputs (dict): ground truth info
"""
#assert self.bbox_head
tgt_labels, _, tgt_gt_inds = targets
rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner(
rois, tgt_labels, tgt_gt_inds, inputs)
......
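A small sketch of the renamed MaskFeat arguments and the renamed accessor (out_channel() → out_channels()); the import path is again an assumption:
# Sketch only: import path is an assumption.
import paddle
from ppdet.modeling.heads.mask_head import MaskFeat
# in_channel/out_channel replace the former in_channels/out_channels arguments.
mask_feat = MaskFeat(in_channel=256, out_channel=256, num_convs=4, norm_type=None)
rois_feat = paddle.rand([2, 256, 14, 14])
up = mask_feat(rois_feat)   # final 2x2 stride-2 deconv doubles the resolution -> [2, 256, 28, 28]
print(up.shape, mask_feat.out_channels())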
......@@ -25,6 +25,31 @@ def _to_list(v):
@register
class RoIAlign(object):
"""
RoI Align module
For more details, please refer to the documentation of roi_align
in ppdet/modeling/ops.py
Args:
resolution (int): The output size, default 14
spatial_scale (float): Multiplicative spatial scale factor to translate
ROI coords from their input scale to the scale used when pooling.
default 0.0625
sampling_ratio (int): The number of sampling points in the interpolation
grid, default 0
canconical_level (int): The referring level of FPN layer with
specified level. default 4
canonical_size (int): The referring scale of FPN layer with
specified scale. default 224
start_level (int): The start level of FPN layer to extract RoI feature,
default 0
end_level (int): The end level of FPN layer to extract RoI feature,
default 3
aligned (bool): Whether to add offset to rois' coord in roi_align.
default false
"""
def __init__(self,
resolution=14,
spatial_scale=0.0625,
......
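A construction-only sketch for RoIAlign using the documented defaults; the import path is an assumption, and the forward call is omitted because its signature is not shown in this diff:
# Sketch only: import path is an assumption.
from ppdet.modeling.heads.roi_extractor import RoIAlign
# spatial_scale 0.0625 corresponds to a stride-16 feature map;
# resolution is the pooled output size (7 for the bbox head, 14 for the mask head).
roi_align = RoIAlign(resolution=14, spatial_scale=0.0625)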
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Normal
from paddle.regularizer import L2Decay
from paddle.nn import Conv2D
from ppdet.core.workspace import register
from ppdet.modeling import ops
@register
class RPNFeat(nn.Layer):
def __init__(self, feat_in=1024, feat_out=1024):
super(RPNFeat, self).__init__()
# rpn feat is shared with each level
self.rpn_conv = Conv2D(
in_channels=feat_in,
out_channels=feat_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.)))
def forward(self, inputs, feats):
rpn_feats = []
for feat in feats:
rpn_feats.append(F.relu(self.rpn_conv(feat)))
return rpn_feats
@register
class RPNHead(nn.Layer):
__inject__ = ['rpn_feat']
def __init__(self, rpn_feat, anchor_per_position=15, rpn_channel=1024):
super(RPNHead, self).__init__()
self.rpn_feat = rpn_feat
if isinstance(rpn_feat, dict):
self.rpn_feat = RPNFeat(**rpn_feat)
# rpn head is shared with each level
# rpn roi classification scores
self.rpn_rois_score = Conv2D(
in_channels=rpn_channel,
out_channels=anchor_per_position,
kernel_size=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.)))
# rpn roi bbox regression deltas
self.rpn_rois_delta = Conv2D(
in_channels=rpn_channel,
out_channels=4 * anchor_per_position,
kernel_size=1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0., std=0.01)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.)))
def forward(self, inputs, feats):
rpn_feats = self.rpn_feat(inputs, feats)
rpn_head_out = []
for rpn_feat in rpn_feats:
rrs = self.rpn_rois_score(rpn_feat)
rrd = self.rpn_rois_delta(rpn_feat)
rpn_head_out.append((rrs, rrd))
return rpn_feats, rpn_head_out
def get_loss(self, loss_inputs):
# cls loss
score_tgt = paddle.cast(
x=loss_inputs['rpn_score_target'], dtype='float32')
score_tgt.stop_gradient = True
loss_rpn_cls = ops.sigmoid_cross_entropy_with_logits(
input=loss_inputs['rpn_score_pred'], label=score_tgt)
loss_rpn_cls = paddle.mean(loss_rpn_cls, name='loss_rpn_cls')
# reg loss
loc_tgt = paddle.cast(x=loss_inputs['rpn_rois_target'], dtype='float32')
loc_tgt.stop_gradient = True
loss_rpn_reg = ops.smooth_l1(
input=loss_inputs['rpn_rois_pred'],
label=loc_tgt,
inside_weight=loss_inputs['rpn_rois_weight'],
outside_weight=loss_inputs['rpn_rois_weight'],
sigma=3.0, )
loss_rpn_reg = paddle.sum(loss_rpn_reg)
score_shape = paddle.shape(score_tgt)
score_shape = paddle.cast(score_shape, dtype='float32')
norm = paddle.prod(score_shape)
norm.stop_gradient = True
loss_rpn_reg = loss_rpn_reg / norm
return {'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_reg': loss_rpn_reg}
......@@ -29,6 +29,34 @@ __all__ = ['FPN']
@register
@serializable
class FPN(nn.Layer):
"""
Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
out_channel (int): output channel of each level
spatial_scales (list[float]): the spatial scales between input feature
maps and original input image which can be derived from the output
shape of backbone by from_config
has_extra_convs (bool): whether to add extra conv to the last level.
default False
extra_stage (int): the number of extra stages added to the last level.
default 1
use_c5 (bool): Whether to use c5 as the input of extra stage,
otherwise p5 is used. default True
norm_type (string|None): The normalization type in FPN module. If
norm_type is None, norm will not be used after conv and if
norm_type is string, bn, gn, sync_bn are available. default None
norm_decay (float): weight decay for normalization layer weights.
default 0.
freeze_norm (bool): whether to freeze normalization layer.
default False
relu_before_extra_convs (bool): whether to add relu before extra convs.
default False
"""
def __init__(self,
in_channels,
out_channel,
......@@ -67,7 +95,7 @@ class FPN(nn.Layer):
else:
lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
in_c = in_channels[i - st_stage]
if self.norm_type == 'gn':
if self.norm_type is not None:
lateral = self.add_sublayer(
lateral_name,
ConvNormLayer(
......@@ -93,7 +121,7 @@ class FPN(nn.Layer):
self.lateral_convs.append(lateral)
fpn_name = 'fpn_res{}_sum'.format(i + 2)
if self.norm_type == 'gn':
if self.norm_type is not None:
fpn_conv = self.add_sublayer(
fpn_name,
ConvNormLayer(
......@@ -128,7 +156,7 @@ class FPN(nn.Layer):
else:
in_c = out_channel
extra_fpn_name = 'fpn_{}'.format(lvl + 2)
if self.norm_type == 'gn':
if self.norm_type is not None:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
ConvNormLayer(
......
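A construction sketch for FPN matching the documented arguments; the ResNet-50 channel list, the spatial scales, and the import path are illustrative assumptions:
# Sketch only: import path and backbone channels are assumptions.
from ppdet.modeling.necks.fpn import FPN
# C2-C5 channels of a ResNet-50 backbone and their strides (1/4 ... 1/32).
fpn = FPN(in_channels=[256, 512, 1024, 2048],
          out_channel=256,
          spatial_scales=[1 / 4, 1 / 8, 1 / 16, 1 / 32],
          norm_type=None)   # None: plain convs, no norm layer is added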
......@@ -25,6 +25,24 @@ from .. import ops
@register
class AnchorGenerator(nn.Layer):
"""
Generate anchors according to the feature maps
Args:
anchor_sizes (list[float] | list[list[float]]): The anchor sizes at
each feature point. list[float] means all feature levels share the
same sizes. list[list[float]] means the anchor sizes for
each level. The sizes stand for the scale of input size.
aspect_ratios (list[float] | list[list[float]]): The aspect ratios at
each feature point. list[float] means all feature levels share the
same ratios. list[list[float]] means the aspect ratios for
each level.
strides (list[float]): The strides of feature maps which generate
anchors
offset (float): The offset of the coordinate of anchors, default 0.
"""
def __init__(self,
anchor_sizes=[32, 64, 128, 256, 512],
aspect_ratios=[0.5, 1.0, 2.0],
......
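A sketch of AnchorGenerator with per-level sizes and shared aspect ratios, as described in the docstring above; the stride values and the import path are illustrative assumptions:
# Sketch only: import path and strides are assumptions.
from ppdet.modeling.proposal_generator.anchor_generator import AnchorGenerator
# One size per FPN level, three ratios shared by all levels -> 3 anchors per feature point.
anchor_gen = AnchorGenerator(anchor_sizes=[[32], [64], [128], [256], [512]],
                             aspect_ratios=[0.5, 1.0, 2.0],
                             strides=[4, 8, 16, 32, 64])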
......@@ -25,6 +25,28 @@ from .. import ops
@register
@serializable
class ProposalGenerator(object):
"""
Proposal generation module
For more details, please refer to the documentation of generate_proposals
in ppdet/modeling/ops.py
Args:
pre_nms_top_n (int): Number of total bboxes to be kept per
image before NMS. default 6000
post_nms_top_n (int): Number of total bboxes to be kept per
image after NMS. default 1000
nms_thresh (float): Threshold in NMS. default 0.5
min_size (float): Remove predicted boxes with either height or
width < min_size. default 0.1
eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,
`adaptive_threshold = adaptive_threshold * eta` in each iteration.
default 1.
topk_after_collect (bool): whether to adopt topk after batch
collection. If topk_after_collect is true, box filter will not be
used after NMS at each image in proposal generation. default false
"""
def __init__(self,
pre_nms_top_n=12000,
post_nms_top_n=2000,
......
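A sketch of ProposalGenerator with the training-time defaults shown in the constructor above (12000 boxes before NMS, 2000 after); the import path is an assumption:
# Sketch only: import path is an assumption.
from ppdet.modeling.proposal_generator.proposal_generator import ProposalGenerator
# Keep 12000 candidate boxes per image before NMS and 2000 after it.
train_proposal = ProposalGenerator(pre_nms_top_n=12000, post_nms_top_n=2000)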
......@@ -27,12 +27,20 @@ from .proposal_generator import ProposalGenerator
class RPNFeat(nn.Layer):
def __init__(self, feat_in=1024, feat_out=1024):
"""
Feature extraction in RPN head
Args:
in_channel (int): Input channel
out_channel (int): Output channel
"""
def __init__(self, in_channel=1024, out_channel=1024):
super(RPNFeat, self).__init__()
# rpn feat is shared with each level
self.rpn_conv = nn.Conv2D(
in_channels=feat_in,
out_channels=feat_out,
in_channels=in_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
weight_attr=paddle.ParamAttr(initializer=Normal(
......@@ -47,6 +55,20 @@ class RPNFeat(nn.Layer):
@register
class RPNHead(nn.Layer):
"""
Region Proposal Network
Args:
anchor_generator (dict): configuration of anchor generation
rpn_target_assign (dict): configuration of rpn targets assignment
train_proposal (dict): configuration of proposals generation
at the stage of training
test_proposal (dict): configuration of proposals generation
at the stage of prediction
in_channel (int): channel of input feature maps which can be
derived by from_config
"""
def __init__(self,
anchor_generator=AnchorGenerator().__dict__,
rpn_target_assign=RPNTargetAssign().__dict__,
......
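A construction sketch for the renamed RPNFeat arguments (feat_in/feat_out → in_channel/out_channel); with an FPN neck the RPN typically runs on 256-channel features, which is an illustrative value here, and the import path is an assumption:
# Sketch only: import path and channel width are assumptions.
from ppdet.modeling.proposal_generator.rpn_head import RPNFeat
# A single 3x3 conv shared across all feature levels.
rpn_feat = RPNFeat(in_channel=256, out_channel=256)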
......@@ -135,12 +135,15 @@ def generate_proposal_target(rpn_rois,
tgt_gt_inds = []
new_rois_num = []
# In cascade rcnn, the thresholds for foreground and background
# are taken from cascade_iou
fg_thresh = cascade_iou if is_cascade else fg_thresh
bg_thresh = cascade_iou if is_cascade else bg_thresh
for i, rpn_roi in enumerate(rpn_rois):
gt_bbox = gt_boxes[i]
gt_class = paddle.squeeze(gt_classes[i], axis=-1)
# Concat RoIs and gt boxes except cascade rcnn
if not is_cascade:
bbox = paddle.concat([rpn_roi, gt_bbox])
else:
......@@ -247,10 +250,12 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,
tgt_weights = []
for k in range(len(rois)):
labels_per_im = labels_int32[k]
# select rois labeled with foreground
fg_inds = paddle.nonzero(
paddle.logical_and(labels_per_im != -1, labels_per_im !=
num_classes))
has_fg = True
# generate fake roi if foreground is empty
if fg_inds.numel() == 0:
has_fg = False
fg_inds = paddle.ones([1], dtype='int32')
......@@ -259,6 +264,8 @@ def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds,
rois_per_im = rois[k]
fg_rois = paddle.gather(rois_per_im, fg_inds)
# Copy the foreground roi to cpu
# to generate mask target with ground-truth
boxes = fg_rois.numpy()
gt_segms_per_im = gt_segms[k]
new_segm = []
......
......@@ -22,6 +22,32 @@ from .target import rpn_anchor_target, generate_proposal_target, generate_mask_t
@register
@serializable
class RPNTargetAssign(object):
"""
RPN targets assignment module
The assignment consists of three steps:
1. Match anchor and ground-truth box, label the anchor with foreground
or background sample
2. Sample anchors to keep a proper ratio between foreground and
background
3. Generate the targets for classification and regression branch
Args:
batch_size_per_im (int): Total number of RPN samples per image.
default 256
fg_fraction (float): Fraction of anchors that is labeled
foreground, default 0.5
positive_overlap (float): Minimum overlap required between an anchor
and ground-truth box for the (anchor, gt box) pair to be
a foreground sample. default 0.7
negative_overlap (float): Maximum overlap allowed between an anchor
and ground-truth box for the (anchor, gt box) pair to be
a background sample. default 0.3
use_random (bool): Use random sampling to choose foreground and
background boxes, default true.
"""
def __init__(self,
batch_size_per_im=256,
fg_fraction=0.5,
......@@ -54,6 +80,33 @@ class RPNTargetAssign(object):
@register
class BBoxAssigner(object):
__shared__ = ['num_classes']
"""
RCNN targets assignment module
The assignment consists of three steps:
1. Match RoIs and ground-truth box, label the RoIs with foreground
or background sample
2. Sample RoIs to keep a proper ratio between foreground and
background
3. Generate the targets for classification and regression branch
Args:
batch_size_per_im (int): Total number of RoIs per image.
default 512
fg_fraction (float): Fraction of RoIs that is labeled
foreground, default 0.25
positive_overlap (float): Minimum overlap required between a RoI
and ground-truth box for the (roi, gt box) pair to be
a foreground sample. default 0.5
negative_overlap (float): Maximum overlap allowed between a RoI
and ground-truth box for the (roi, gt box) pair to be
a background sample. default 0.5
use_random (bool): Use random sampling to choose foreground and
background boxes, default true
cascade_iou (list[float]): The list of overlap thresholds used to select foreground
and background at each stage, which is only used in Cascade RCNN.
num_classes (int): The number of classes.
"""
def __init__(self,
batch_size_per_im=512,
......@@ -61,7 +114,6 @@ class BBoxAssigner(object):
fg_thresh=.5,
bg_thresh=.5,
use_random=True,
is_cls_agnostic=False,
cascade_iou=[0.5, 0.6, 0.7],
num_classes=80):
super(BBoxAssigner, self).__init__()
......@@ -70,7 +122,6 @@ class BBoxAssigner(object):
self.fg_thresh = fg_thresh
self.bg_thresh = bg_thresh
self.use_random = use_random
self.is_cls_agnostic = is_cls_agnostic
self.cascade_iou = cascade_iou
self.num_classes = num_classes
......@@ -99,6 +150,18 @@ class BBoxAssigner(object):
@serializable
class MaskAssigner(object):
__shared__ = ['num_classes', 'mask_resolution']
"""
Mask targets assignment module
The assignment consists of two steps:
1. Select RoIs labeled as foreground.
2. Encode the RoIs and corresponding gt polygons to generate
mask targets
Args:
num_classes (int): The number of class
mask_resolution (int): The resolution of mask target, default 14
"""
def __init__(self, num_classes=80, mask_resolution=14):
super(MaskAssigner, self).__init__()
......
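Quick arithmetic for the sampling budgets documented above (pure Python, values taken from the stated defaults):
rpn_batch, rpn_fg_fraction = 256, 0.5      # RPNTargetAssign defaults
rcnn_batch, rcnn_fg_fraction = 512, 0.25   # BBoxAssigner defaults
max_fg_anchors = int(rpn_batch * rpn_fg_fraction)   # at most 128 foreground anchors per image
max_fg_rois = int(rcnn_batch * rcnn_fg_fraction)    # at most 128 foreground RoIs per image
print(max_fg_anchors, max_fg_rois)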