Fix english doc api, involves the op of retinanet_target_assign,...

Fix english doc api, involves the op of retinanet_target_assign, sigmoid_focal_loss and retinanet_detection_output, cherry-pick, (#20419) test=release/1.6 test=document_fix

Fix english doc api, involves the op of retinanet_target_assign,...
Fix english doc api, involves the op of retinanet_target_assign, sigmoid_focal_loss and retinanet_detection_output, cherry-pick, (#20419) test=release/1.6 test=document_fix
cf8cfb2a · FlyingQianMM · GitHub · c7882f91 · cf8cfb2a · cf8cfb2a
隐藏空白更改
内联并排

Showing with 271 additions and 161 deletions

paddle/fluid/API.spec paddle/fluid/API.spec +3 -3

python/paddle/fluid/layers/detection.py python/paddle/fluid/layers/detection.py +268 -158

未找到文件。
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -417,8 +417,8 @@ paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'ne
 paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40'))
 paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b'))
 paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', 'd46629656b4ce9b07809e32c0482cbef'))
-paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
+paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', '543b2a40641260e745a76b1f7a25fb2a'))
-paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
+paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', '4702891755596c8853aaeb874a5fdb46'))
 paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', 'a7778d4f557c60dca52321673667690d'))
 paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1'))
 paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'f2342042127b536a0a16390f149f1bba'))
@@ -432,7 +432,7 @@ paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_n
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ce2bfbd685f2a36eda400e00569908cb'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'c01ac2f1fced1ddd98574e71e877a6c2'))
 paddle.fluid.layers.multiclass_nms2 (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'return_index', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, False, None)), ('document', 'be156186ee7a2ee56ab30b964acb15e5'))
-paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8'))
+paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '488d24c6bd767b8c4422521f15d86c66'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1f2b6bfb3027ea63ab86859391f45b03'))
 paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8874f917b4da34541efe427841a8f205'))
 paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ff4a651d65a9a9f9da71349ba6a2dc1f'))

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -73,100 +73,166 @@ def retinanet_target_assign(bbox_pred,
                            positive_overlap=0.5,
                            negative_overlap=0.4):
    """
-    **Target Assign Layer for Retinanet .**
+    **Target Assign Layer for the detector RetinaNet.**
+    This OP finds out positive and negative samples from all anchors
+    for training the detector `RetinaNet <https://arxiv.org/abs/1708.02002>`_ ,
+    and assigns target labels for classification along with target locations for
+    regression to each sample, then takes out the part belonging to positive and
+    negative samples from category prediction( :attr:`cls_logits`) and location
+    prediction( :attr:`bbox_pred`) which belong to all anchors.
+    The searching principles for positive and negative samples are as follows:
+    1. An anchor is assigned to a ground-truth box when it has the highest IoU
+    overlap with that ground-truth box.
+    2. An anchor is assigned to a ground-truth box when it has an IoU overlap
+    higher than :attr:`positive_overlap` with any ground-truth box.
+    3. An anchor is assigned to background when its IoU overlap is lower than
+    :attr:`negative_overlap` for all ground-truth boxes.
+    4. Anchors which do not meet the above conditions do not participate in
+    the training process.
+    RetinaNet predicts a :math:`C`-vector for classification and a 4-vector for box
+    regression for each anchor, hence the target label for each positive(or negative)
+    sample is a :math:`C`-vector and the target locations for each positive sample
+    is a 4-vector. As for a positive sample, if the category of its assigned
+    ground-truth box is class :math:`i`, the corresponding entry in its length
+    :math:`C` label vector is set to 1 and all other entries are set to 0, its box
+    regression targets are computed as the offset between itself and its assigned
+    ground-truth box. As for a negative sample, all entries in its length :math:`C`
+    label vector are set to 0 and box regression targets are omitted because
+    negative samples do not participate in the training process of location
+    regression.
+    After the assignment, the part belonging to positive and negative samples is
+    taken out from category prediction( :attr:`cls_logits` ), and the part
+    belonging to positive samples is taken out from location
+    prediction( :attr:`bbox_pred` ).
-    This layer can be, for given the Intersection-over-Union (IoU) overlap
-    between anchors and ground truth boxes, to assign classification and
-    regression targets to each anchor, these target labels are used for training
-    retinanet. Every anchor is assigned with a length :attr:`num_classes`
-    one-hot vector of classification targets, and a 4-vector of box regression
-    targets. The assignment rules are as followed:
-    1. Anchors are assigned to ground-truth boxes when: (i) it has the highest
-    IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher
-    than positive_overlap(0.5) with any ground-truth box.
-    2. Anchors are assigned to background when its IoU ratio is lower than
-    negative_overlap (0.4) for all ground-truth boxes.
-    When an anchor is assigned with a ground-truth box which is the i-th category,
-    the i-th entry in its C vector of targets is set to 1 and all other entries
-    are set to 0. When an anchor is assigned with background, all entries are set
-    to 0. Anchors that are not assigned do not contribute to the training
-    objective. The regression targets are the encoded ground-truth boxes
-    associated with the assigned anchors.
    Args:
-        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+        bbox_pred(Variable): A 3-D Tensor with shape :math:`[N, M, 4]` represents
-            predicted locations of M bounding bboxes. N is the batch size,
+            the predicted locations of all anchors. :math:`N` is the batch size( the
-            and each bounding box has four coordinate values and the layout
+            number of images in a mini-batch), :math:`M` is the number of all anchors
-            is [xmin, ymin, xmax, ymax].
+            of one image, and each anchor has 4 coordinate values. The data type of
-        cls_logits(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            :attr:`bbox_pred` is float32 or float64.
-            predicted confidence predictions. N is the batch size, C is the
+        cls_logits(Variable): A 3-D Tensor with shape :math:`[N, M, C]` represents
-            number of classes (excluding background), M is number of bounding boxes.
+            the predicted categories of all anchors. :math:`N` is the batch size,
-        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            :math:`M` is the number of all anchors of one image, and :math:`C` is
-            each box is represented as [xmin, ymin, xmax, ymax],
+            the number of categories (**Notice: excluding background**). The data type
-            [xmin, ymin] is the left top coordinate of the anchor box,
+            of :attr:`cls_logits` is float32 or float64.
-            if the input is image feature map, they are close to the origin
+        anchor_box(Variable): A 2-D Tensor with shape :math:`[M, 4]` represents
-            of the coordinate system. [xmax, ymax] is the right bottom
+            the locations of all anchors. :math:`M` is the number of all anchors of
-            coordinate of the anchor box.
+            one image, each anchor is represented as :math:`[xmin, ymin, xmax, ymax]`,
-        anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded 
+            :math:`[xmin, ymin]` is the left top coordinate of the anchor box,
-            variances of anchors.
+            :math:`[xmax, ymax]` is the right bottom coordinate of the anchor box.
-        gt_boxes(Variable): The ground-truth bounding boxes (bboxes) are a 2D
+            The data type of :attr:`anchor_box` is float32 or float64. Please refer
-            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            to the OP :ref:`api_fluid_layers_anchor_generator` 
-            bboxes of mini-batch input.
+            for the generation of :attr:`anchor_box`.
-        gt_labels(variable): The ground-truth labels are a 2D LoDTensor with
+        anchor_var(Variable): A 2-D Tensor with shape :math:`[M,4]` represents the expanded 
-            shape [Ng, 1], Ng is the total number of ground-truth labels of
+            factors of anchor locations used in the loss function. :math:`M` is the number of
-            mini-batch input.
+            all anchors of one image, each anchor possesses a 4-vector expanded factor.
-        is_crowd(Variable): A 1-D LoDTensor which indicates ground-truth is crowd.
+            The data type of :attr:`anchor_var` is float32 or float64. Please refer
-        im_info(Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
+            to the OP :ref:`api_fluid_layers_anchor_generator`
-            3 is the height, width and scale.
+            for the generation of :attr:`anchor_var`.
-        num_classes(int32): The number of classes.
+        gt_boxes(Variable): A 1-level 2-D LoDTensor with shape :math:`[G, 4]` represents
-        positive_overlap(float): Minimum overlap required between an anchor
+            locations of all ground-truth boxes. :math:`G` is the total number of
-            and ground-truth box for the (anchor, gt box) pair to be a positive
+            all ground-truth boxes in a mini-batch, and each ground-truth box has 4
-            example.
+            coordinate values. The data type of :attr:`gt_boxes` is float32 or
-        negative_overlap(float): Maximum overlap allowed between an anchor
+            float64.
-            and ground-truth box for the (anchor, gt box) pair to be a negative
+        gt_labels(Variable): A 1-level 2-D LoDTensor with shape :math:`[G, 1]` represents
-            examples.
+            categories of all ground-truth boxes, and the values are in the range of
+            :math:`[1, C]`. :math:`G` is the total number of all ground-truth boxes
+            in a mini-batch, and each ground-truth box has one category. The data type
+            of :attr:`gt_labels` is int32.
+        is_crowd(Variable): A 1-level 1-D LoDTensor with shape :math:`[G]` which
+            indicates whether a ground-truth box is a crowd. If the value is 1, the
+            corresponding box is a crowd, it is ignored during training. :math:`G` is
+            the total number of all ground-truth boxes in a mini-batch. The data type
+            of :attr:`is_crowd` is int32.
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents the size
+            information of input images. :math:`N` is the batch size, the size
+            information of each image is a 3-vector consisting of the height and width
+            of the network input along with the factor scaling the origin image to
+            the network input. The data type of :attr:`im_info` is float32.
+        num_classes(int32): The number of categories for classification, the default
+            value is 1.
+        positive_overlap(float32): Minimum overlap required between an anchor
+            and ground-truth box for the anchor to be a positive sample, the default
+            value is 0.5.
+        negative_overlap(float32): Maximum overlap allowed between an anchor
+            and ground-truth box for the anchor to be a negative sample, the default
+            value is 0.4. :attr:`negative_overlap` should be less than or equal to
+            :attr:`positive_overlap`, if not, the actual value of
+            :attr:`positive_overlap` is :attr:`negative_overlap`.
    Returns:
-        tuple:
+        A tuple with 6 Variables:
-               A tuple(predicted_scores, predicted_location, target_label,
-               target_bbox, bbox_inside_weight, fg_num) is returned. The
+        **predict_scores** (Variable): A 2-D Tensor with shape :math:`[F+B, C]` represents
-               predicted_scores and predicted_location are the predicted result
+        category prediction belonging to positive and negative samples. :math:`F`
-               of the retinanet.The target_label and target_bbox are the ground
+        is the number of positive samples in a mini-batch, :math:`B` is the number
-               truth, respectively. The predicted_location is a 2D Tensor with
+        of negative samples, and :math:`C` is the number of categories
-               shape [F, 4], and the shape of target_bbox is same as the shape of
+        (**Notice: excluding background**). The data type of :attr:`predict_scores`
-               the predicted_location, F is the number of the foreground
+        is float32 or float64.
-               anchors. The predicted_scores is a 2D Tensor with shape
-               [F + B, C], and the shape of target_label is [F + B, 1], B is the
+        **predict_location** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents
-               number of the background anchors, the F and B is depends on the
+        location prediction belonging to positive samples. :math:`F` is the number
-               input of this operator. Bbox_inside_weight represents whether the
+        of positive samples, and each
-               predicted location is fake foreground or not and the shape is [F, 4].
+        sample has 4 coordinate values. The data type of :attr:`predict_location`
-               Fg_num is the foreground number (including fake foreground) which
+        is float32 or float64.
-               is needed by focal loss.
+        **target_label** (Variable): A 2-D Tensor with shape :math:`[F+B, 1]` represents
+        target labels for classification belonging to positive and negative
+        samples. :math:`F` is the number of positive samples, :math:`B` is the
+        number of negative samples, and each sample has one target category. The data type
+        of :attr:`target_label` is int32.
+        **target_bbox** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents
+        target locations for box regression belonging to positive samples.
+        :math:`F` is the number of positive samples, and each sample has 4
+        coordinate values. The data type of :attr:`target_bbox` is float32 or
+        float64.
+        **bbox_inside_weight** (Variable): A 2-D Tensor with shape :math:`[F, 4]`
+        represents whether a positive sample is a fake positive. If a positive
+        sample is a fake positive, the corresponding entries in
+        :attr:`bbox_inside_weight` are set to 0, otherwise 1. :math:`F` is the number
+        of total positive samples in a mini-batch, and each sample has 4
+        coordinate values. The data type of :attr:`bbox_inside_weight` is float32
+        or float64.
+        **fg_num** (Variable): A 2-D Tensor with shape :math:`[N, 1]` represents the number
+        of positive samples. :math:`N` is the batch size. **Notice: The number
+        of positive samples is used as the denominator of later loss function,
+        to avoid the condition that the denominator is zero, this OP has added 1
+        to the actual number of positive samples of each image.** The data type of
+        :attr:`fg_num` is int32.
    Examples:
        .. code-block:: python
          import paddle.fluid as fluid
-          bbox_pred = layers.data(name='bbox_pred', shape=[1, 100, 4],
+          bbox_pred = fluid.data(name='bbox_pred', shape=[1, 100, 4],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          cls_logits = layers.data(name='cls_logits', shape=[1, 100, 10],
+          cls_logits = fluid.data(name='cls_logits', shape=[1, 100, 10],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          anchor_box = layers.data(name='anchor_box', shape=[100, 4],
+          anchor_box = fluid.data(name='anchor_box', shape=[100, 4],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          anchor_var = layers.data(name='anchor_var', shape=[100, 4],
+          anchor_var = fluid.data(name='anchor_var', shape=[100, 4],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
+          gt_boxes = fluid.data(name='gt_boxes', shape=[10, 4],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          gt_labels = layers.data(name='gt_labels', shape=[10, 1],
+          gt_labels = fluid.data(name='gt_labels', shape=[10, 1],
-                            append_batch_size=False, dtype='float32')
+                            dtype='int32')
-          is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
+          is_crowd = fluid.data(name='is_crowd', shape=[1],
-                            append_batch_size=False, dtype='float32')
+                            dtype='int32')
-          im_info = fluid.layers.data(name='im_infoss', shape=[1, 3],
+          im_info = fluid.data(name='im_infoss', shape=[1, 3],
-                            append_batch_size=False, dtype='float32')
+                            dtype='float32')
-          loc_pred, score_pred, loc_target, score_target, bbox_inside_weight, fg_num =
+          score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num =
                fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box,
                anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10)
@@ -370,48 +436,61 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2, alpha=0.25):
    """
    **Sigmoid Focal Loss Operator.**
-    Focal loss is used to address the foreground-background class imbalance existed
+    `Focal Loss <https://arxiv.org/abs/1708.02002>`_ is used to address the foreground-background
-    on the training phase of one-stage detectors. This operator computes the sigmoid
+    class imbalance that exists in the training phase of many computer vision tasks. This OP computes
-    value for each element in the input tensor, after which focal loss is measured.
+    the sigmoid value for each element in the input tensor :attr:`x`, after which focal loss is
+    measured between the sigmoid value and target label. 
    The focal loss is given as follows:
    .. math::
-        loss_j = (-label_j * alpha * {(1 - \\sigma(x_j))}^{gamma} * \\log(\\sigma(x_j)) -
-        (1 - labels_j) * (1 - alpha) * {(\sigma(x_j)}^{ gamma} * \\log(1 - \\sigma(x_j)))
+        \\mathop{loss_{i,\\,j}}\\limits_{i\\in\\mathbb{[0,\\,N-1]},\\,j\\in\\mathbb{[0,\\,C-1]}}=\\left\\{
-        / fg\_num, j = 1,...,K
+        \\begin{array}{rcl}
+        - \\frac{1}{fg\_num} * \\alpha * {(1 - \\sigma(x_{i,\\,j}))}^{\\gamma} * \\log(\\sigma(x_{i,\\,j})) & & {(j +1) = label_{i,\\,0}} \\\\
+        - \\frac{1}{fg\_num} * (1 - \\alpha) * {\\sigma(x_{i,\\,j})}^{ \\gamma} * \\log(1 - \\sigma(x_{i,\\,j})) & & {(j +1)!= label_{i,\\,0}}
+        \\end{array} \\right.
    We know that
    .. math::
        \\sigma(x_{i,\\,j}) = \\frac{1}{1 + \\exp(-x_{i,\\,j})}
-    Args:
-        x(Variable): A 2-D tensor with shape [N, D], where N is the batch size and D is the number
-            of classes (excluding background). This input is a tensor of logits computed by the
-            previous operator.
-        label(Variable): A 2-D tensor with shape [N, 1], which is the probabilistic labels.
-        fg_num(Variable): A 1-D tensor with shape [1], which is the number of foreground.
+    Args:
+        x(Variable): A 2-D tensor with shape :math:`[N, C]` represents the predicted categories of
+            all samples. :math:`N` is the number of all samples responsible for optimization in
+            a mini-batch, for example, samples are anchor boxes for object detection and :math:`N`
+            is the total number of positive and negative samples in a mini-batch; Samples are images
+            for image classification and :math:`N` is the number of images in a mini-batch. :math:`C`
+            is the number of classes (**Notice: excluding background**). The data type of :attr:`x` is
+            float32 or float64.
+        label(Variable): A 2-D tensor with shape :math:`[N, 1]` represents the target labels for
+            classification. :math:`N` is the number of all samples responsible for optimization in a
+            mini-batch, each sample has one target category. The values for positive samples are in the
+            range of :math:`[1, C]`, and the values for negative samples are 0. The data type of :attr:`label`
+            is int32.
+        fg_num(Variable): A 1-D tensor with shape [1] represents the number of positive samples in a
+            mini-batch, which should be obtained before this OP. The data type of :attr:`fg_num` is int32.
        gamma(float): Hyper-parameter to balance the easy and hard examples. Default value is
            set to 2.0.
        alpha(float): Hyper-parameter to balance the positive and negative example. Default value
            is set to 0.25.
    Returns:
-        out(Variable): A 2-D tensor with shape [N, D], which is the focal loss.
+        Variable(the data type is float32 or float64): 
+            A 2-D tensor with shape :math:`[N, C]`, which is the focal loss of each element in the input
+            tensor :attr:`x`.
    Examples:
        .. code-block:: python
            import paddle.fluid as fluid
-            input = fluid.layers.data(
+            input = fluid.data(name='data', shape=[10,80], dtype='float32')
-                name='data', shape=[10,80], append_batch_size=False, dtype='float32')
+            label = fluid.data(name='label', shape=[10,1], dtype='int32')
-            label = fluid.layers.data(
+            fg_num = fluid.data(name='fg_num', shape=[1], dtype='int32')
-                name='label', shape=[10,1], append_batch_size=False, dtype='int32')
-            fg_num = fluid.layers.data(
-                name='fg_num', shape=[1], append_batch_size=False, dtype='int32')
            loss = fluid.layers.sigmoid_focal_loss(x=input,
                                                   label=label,
                                                   fg_num=fg_num,
@@ -2654,78 +2733,109 @@ def retinanet_detection_output(bboxes,
                               nms_threshold=0.3,
                               nms_eta=1.):
    """
-    **Detection Output Layer for Retinanet.**
+    **Detection Output Layer for the detector RetinaNet.**
-    This operation is to get the detection results by performing following
+    In the detector `RetinaNet <https://arxiv.org/abs/1708.02002>`_ , many 
-    steps:
+    `FPN <https://arxiv.org/abs/1612.03144>`_ levels output the category
+    and location predictions. This OP gets the detection results by
+    performing the following steps:
-    1. Decode top-scoring bounding box predictions per FPN level according 
+    1. For each FPN level, decode box predictions according to the anchor
-       to the anchor boxes.
+       boxes from at most :attr:`nms_top_k` top-scoring predictions after
+       thresholding detector confidence at :attr:`score_threshold`.
    2. Merge top predictions from all levels and apply multi-class non 
       maximum suppression (NMS) on them to get the final detections.
    Args:
-        bboxes(List): A list of tensors from multiple FPN levels. Each
+        bboxes(List): A list of Tensors from multiple FPN levels represents
-            element is a 3-D Tensor with shape [N, Mi, 4] representing the
+            the location prediction for all anchor boxes. Each element is
-            predicted locations of Mi bounding boxes. N is the batch size,
+            a 3-D Tensor with shape :math:`[N, Mi, 4]`, :math:`N` is the
-            Mi is the number of bounding boxes from i-th FPN level and each 
+            batch size, :math:`Mi` is the number of bounding boxes from
-            bounding box has four coordinate values and the layout is
+            :math:`i`-th FPN level and each bounding box has four coordinate
-            [xmin, ymin, xmax, ymax].
+            values and the layout is [xmin, ymin, xmax, ymax]. The data type
-        scores(List): A list of tensors from multiple FPN levels. Each
+            of each element is float32 or float64.
-            element is a 3-D Tensor with shape [N, Mi, C] representing the
+        scores(List): A list of Tensors from multiple FPN levels represents
-            predicted confidence predictions. N is the batch size, C is the
+            the category prediction for all anchor boxes. Each element is a
-            class number (excluding background), Mi is the number of bounding
+            3-D Tensor with shape :math:`[N, Mi, C]`,  :math:`N` is the batch
-            boxes from i-th FPN level. For each bounding box, there are total
+            size, :math:`C` is the class number (**excluding background**),
-            C scores.
+            :math:`Mi` is the number of bounding boxes from :math:`i`-th FPN
-        anchors(List): A 2-D Tensor with shape [Mi, 4] represents the locations
+            level. The data type of each element is float32 or float64.
-            of Mi anchor boxes from all FPN level. Each bounding box has four
+        anchors(List): A list of Tensors from multiple FPN levels represents
+            the locations of all anchor boxes. Each element is a 2-D Tensor
+            with shape :math:`[Mi, 4]`, :math:`Mi` is the number of bounding
+            boxes from :math:`i`-th FPN level, and each bounding box has four
            coordinate values and the layout is [xmin, ymin, xmax, ymax].
-        im_info(Variable): A 2-D LoDTensor with shape [N, 3] represents the
+            The data type of each element is float32 or float64.
-            image information. N is the batch size, each image information
+        im_info(Variable): A 2-D Tensor with shape :math:`[N, 3]` represents the size
-            includes height, width and scale.
+            information of input images. :math:`N` is the batch size, the size
+            information of each image is a 3-vector consisting of the height and width
+            of the network input along with the factor scaling the original image to
+            the network input. The data type of :attr:`im_info` is float32.
        score_threshold(float): Threshold to filter out bounding boxes
-            with a confidence score.
+            with a confidence score before NMS, default value is set to 0.05.
        nms_top_k(int): Maximum number of detections per FPN layer to be
-            kept according to the confidences before NMS.
+            kept according to the confidences before NMS, default value is set to
+            1000.
        keep_top_k(int): Number of total bounding boxes to be kept per image after
-            NMS step. -1 means keeping all bounding boxes after NMS step.
+            NMS step. Default value is set to 100, -1 means keeping all bounding
-        nms_threshold(float): The threshold to be used in NMS.
+            boxes after NMS step.
-        nms_eta(float): The parameter for adaptive NMS.
+        nms_threshold(float): The Intersection-over-Union(IoU) threshold used to 
+            filter out boxes in NMS.
+        nms_eta(float): The parameter for adjusting :attr:`nms_threshold` in NMS.
+            Default value is set to 1., which means the value of
+            :attr:`nms_threshold` stays the same in NMS. If :attr:`nms_eta` is set
+            to be lower than 1. and the value of :attr:`nms_threshold` is set to
+            be higher than 0.5, every time a bounding box is filtered out,
+            :attr:`nms_threshold` is updated as :attr:`nms_threshold`
+            = :attr:`nms_threshold` * :attr:`nms_eta`, and this adjustment continues until
+            the actual value of :attr:`nms_threshold` is lower than or equal to
+            0.5.
+    **Notice**: In some cases where the image sizes are very small, it's possible
+    that there is no detection if :attr:`score_threshold` is used at all
+    levels. Hence, this OP does not filter out anchors from the highest FPN level
+    before NMS. And the last element in :attr:`bboxes`, :attr:`scores` and
+    :attr:`anchors` is required to be from the highest FPN level.
    Returns:
-        Variable:
+        Variable(The data type is float32 or float64):
-            The detection output is a LoDTensor with shape [No, 6].
+            The detection output is a 1-level LoDTensor with shape :math:`[No, 6]`.
            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
-            `No` is the total number of detections in this mini-batch. For each
+            :math:`No` is the total number of detections in this mini-batch.
-            instance, the offsets in first dimension are called LoD, the offset
+            The :math:`i`-th image has `LoD[i + 1] - LoD[i]` detected
-            number is N + 1, N is the batch size. The i-th image has
+            results, if `LoD[i + 1] - LoD[i]` is 0, the :math:`i`-th image
-            `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
            has no detected results. If all images have no detected results,
            LoD will be set to 0, and the output tensor is empty (None).
    Examples:
        .. code-block:: python
-            import paddle.fluid as fluid
-            bboxes = layers.data(name='bboxes', shape=[1, 21, 4],
+           import paddle.fluid as fluid
-                append_batch_size=False, dtype='float32')
-            scores = layers.data(name='scores', shape=[1, 21, 10],
+           bboxes_low = fluid.data(
-                append_batch_size=False, dtype='float32')
+               name='bboxes_low', shape=[1, 44, 4], dtype='float32')
-            anchors = layers.data(name='anchors', shape=[21, 4],
+           bboxes_high = fluid.data(
-                append_batch_size=False, dtype='float32')
+               name='bboxes_high', shape=[1, 11, 4], dtype='float32')
-            im_info = layers.data(name="im_info", shape=[1, 3],
+           scores_low = fluid.data(
-                append_batch_size=False, dtype='float32')
+               name='scores_low', shape=[1, 44, 10], dtype='float32')
-            nmsed_outs = fluid.layers.retinanet_detection_output(
+           scores_high = fluid.data(
-                                                    bboxes=[bboxes, bboxes],
+               name='scores_high', shape=[1, 11, 10], dtype='float32')
-                                                    scores=[scores, scores],
+           anchors_low = fluid.data(
-                                                    anchors=[anchors, anchors],
+               name='anchors_low', shape=[44, 4], dtype='float32')
-                                                    im_info=im_info,
+           anchors_high = fluid.data(
-                                                    score_threshold=0.05,
+               name='anchors_high', shape=[11, 4], dtype='float32')
-                                                    nms_top_k=1000,
+           im_info = fluid.data(
-                                                    keep_top_k=100,
+               name="im_info", shape=[1, 3], dtype='float32')
-                                                    nms_threshold=0.3,
+           nmsed_outs = fluid.layers.retinanet_detection_output(
-                                                    nms_eta=1.)
+                                          bboxes=[bboxes_low, bboxes_high],
+                                          scores=[scores_low, scores_high],
+                                          anchors=[anchors_low, anchors_high],
+                                          im_info=im_info,
+                                          score_threshold=0.05,
+                                          nms_top_k=1000,
+                                          keep_top_k=100,
+                                          nms_threshold=0.45,
+                                          nms_eta=1.)
    """
    helper = LayerHelper('retinanet_detection_output', **locals())