未验证 提交 cf8cfb2a 编写于 作者: F FlyingQianMM 提交者: GitHub

Fix English doc API, involves the op of retinanet_target_assign,...

Fix English doc API, involves the op of retinanet_target_assign, sigmoid_focal_loss and retinanet_detection_output, cherry-pick, (#20419)

test=release/1.6
test=document_fix
上级 c7882f91
...@@ -417,8 +417,8 @@ paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'ne ...@@ -417,8 +417,8 @@ paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'ne
paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40')) paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40'))
paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b')) paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b'))
paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', 'd46629656b4ce9b07809e32c0482cbef')) paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', 'd46629656b4ce9b07809e32c0482cbef'))
paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595')) paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', '543b2a40641260e745a76b1f7a25fb2a'))
paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d')) paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', '4702891755596c8853aaeb874a5fdb46'))
paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', 'a7778d4f557c60dca52321673667690d')) paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', 'a7778d4f557c60dca52321673667690d'))
paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1')) paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1'))
paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'f2342042127b536a0a16390f149f1bba')) paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'f2342042127b536a0a16390f149f1bba'))
...@@ -432,7 +432,7 @@ paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_n ...@@ -432,7 +432,7 @@ paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_n
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ce2bfbd685f2a36eda400e00569908cb')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ce2bfbd685f2a36eda400e00569908cb'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'c01ac2f1fced1ddd98574e71e877a6c2')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'c01ac2f1fced1ddd98574e71e877a6c2'))
paddle.fluid.layers.multiclass_nms2 (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'return_index', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, False, None)), ('document', 'be156186ee7a2ee56ab30b964acb15e5')) paddle.fluid.layers.multiclass_nms2 (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'return_index', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, False, None)), ('document', 'be156186ee7a2ee56ab30b964acb15e5'))
paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8')) paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '488d24c6bd767b8c4422521f15d86c66'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1f2b6bfb3027ea63ab86859391f45b03')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1f2b6bfb3027ea63ab86859391f45b03'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8874f917b4da34541efe427841a8f205')) paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8874f917b4da34541efe427841a8f205'))
paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ff4a651d65a9a9f9da71349ba6a2dc1f')) paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ff4a651d65a9a9f9da71349ba6a2dc1f'))
......
...@@ -73,100 +73,166 @@ def retinanet_target_assign(bbox_pred, ...@@ -73,100 +73,166 @@ def retinanet_target_assign(bbox_pred,
positive_overlap=0.5, positive_overlap=0.5,
negative_overlap=0.4): negative_overlap=0.4):
""" """
**Target Assign Layer for Retinanet .** **Target Assign Layer for the detector RetinaNet.**
This OP finds out positive and negative samples from all anchors
for training the detector `RetinaNet <https://arxiv.org/abs/1708.02002>`_ ,
and assigns target labels for classification along with target locations for
regression to each sample, then takes out the part belonging to positive and
negative samples from category prediction( :attr:`cls_logits`) and location
prediction( :attr:`bbox_pred`) which belong to all anchors.
The searching principles for positive and negative samples are as follows:
1. An anchor is assigned to a ground-truth box when it has the highest IoU
overlap with that ground-truth box.
2. An anchor is assigned to a ground-truth box when it has an IoU overlap
higher than :attr:`positive_overlap` with any ground-truth box.
3. An anchor is assigned to background when its IoU overlap is lower than
:attr:`negative_overlap` for all ground-truth boxes.
4. Anchors which do not meet the above conditions do not participate in
the training process.
RetinaNet predicts a :math:`C`-vector for classification and a 4-vector for box
regression for each anchor, hence the target label for each positive (or negative)
sample is a :math:`C`-vector and the target locations for each positive sample
is a 4-vector. As for a positive sample, if the category of its assigned
ground-truth box is class :math:`i`, the corresponding entry in its length
:math:`C` label vector is set to 1 and all other entries are set to 0, its box
regression targets are computed as the offset between itself and its assigned
ground-truth box. As for a negative sample, all entries in its length :math:`C`
label vector are set to 0 and box regression targets are omitted because
negative samples do not participate in the training process of location
regression.
After the assignment, the part belonging to positive and negative samples is
taken out from category prediction( :attr:`cls_logits` ), and the part
belonging to positive samples is taken out from location
prediction( :attr:`bbox_pred` ).
This layer can be, for given the Intersection-over-Union (IoU) overlap
between anchors and ground truth boxes, to assign classification and
regression targets to each anchor, these target labels are used for training
retinanet. Every anchor is assigned with a length :attr:`num_classes`
one-hot vector of classification targets, and a 4-vector of box regression
targets. The assignment rules are as followed:
1. Anchors are assigned to ground-truth boxes when: (i) it has the highest
IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher
than positive_overlap(0.5) with any ground-truth box.
2. Anchors are assigned to background when its IoU ratio is lower than
negative_overlap (0.4) for all ground-truth boxes.
When an anchor is assigned with a ground-truth box which is the i-th category,
the i-th entry in its C vector of targets is set to 1 and all other entries
are set to 0. When an anchor is assigned with background, all entries are set
to 0. Anchors that are not assigned do not contribute to the training
objective. The regression targets are the encoded ground-truth boxes
associated with the assigned anchors.
Args: Args:
bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the bbox_pred(Variable): A 3-D Tensor with shape :math:`[N, M, 4]` represents
predicted locations of M bounding bboxes. N is the batch size, the predicted locations of all anchors. :math:`N` is the batch size( the
and each bounding box has four coordinate values and the layout number of images in a mini-batch), :math:`M` is the number of all anchors
is [xmin, ymin, xmax, ymax]. of one image, and each anchor has 4 coordinate values. The data type of
cls_logits(Variable): A 3-D Tensor with shape [N, M, C] represents the :attr:`bbox_pred` is float32 or float64.
predicted confidence predictions. N is the batch size, C is the cls_logits(Variable): A 3-D Tensor with shape :math:`[N, M, C]` represents
number of classes (excluding background), M is number of bounding boxes. the predicted categories of all anchors. :math:`N` is the batch size,
anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, :math:`M` is the number of all anchors of one image, and :math:`C` is
each box is represented as [xmin, ymin, xmax, ymax], the number of categories (**Notice: excluding background**). The data type
[xmin, ymin] is the left top coordinate of the anchor box, of :attr:`cls_logits` is float32 or float64.
if the input is image feature map, they are close to the origin anchor_box(Variable): A 2-D Tensor with shape :math:`[M, 4]` represents
of the coordinate system. [xmax, ymax] is the right bottom the locations of all anchors. :math:`M` is the number of all anchors of
coordinate of the anchor box. one image, each anchor is represented as :math:`[xmin, ymin, xmax, ymax]`,
anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded :math:`[xmin, ymin]` is the left top coordinate of the anchor box,
variances of anchors. :math:`[xmax, ymax]` is the right bottom coordinate of the anchor box.
gt_boxes(Variable): The ground-truth bounding boxes (bboxes) are a 2D The data type of :attr:`anchor_box` is float32 or float64. Please refer
LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth to the OP :ref:`api_fluid_layers_anchor_generator`
bboxes of mini-batch input. for the generation of :attr:`anchor_box`.
gt_labels(variable): The ground-truth labels are a 2D LoDTensor with anchor_var(Variable): A 2-D Tensor with shape :math:`[M,4]` represents the expanded
shape [Ng, 1], Ng is the total number of ground-truth labels of factors of anchor locations used in loss function. :math:`M` is number of
mini-batch input. all anchors of one image, each anchor possesses a 4-vector expanded factor.
is_crowd(Variable): A 1-D LoDTensor which indicates ground-truth is crowd. The data type of :attr:`anchor_var` is float32 or float64. Please refer
im_info(Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, to the OP :ref:`api_fluid_layers_anchor_generator`
3 is the height, width and scale. for the generation of :attr:`anchor_var`.
num_classes(int32): The number of classes. gt_boxes(Variable): A 1-level 2-D LoDTensor with shape :math:`[G, 4]` represents
positive_overlap(float): Minimum overlap required between an anchor locations of all ground-truth boxes. :math:`G` is the total number of
and ground-truth box for the (anchor, gt box) pair to be a positive all ground-truth boxes in a mini-batch, and each ground-truth box has 4
example. coordinate values. The data type of :attr:`gt_boxes` is float32 or
negative_overlap(float): Maximum overlap allowed between an anchor float64.
and ground-truth box for the (anchor, gt box) pair to be a negative gt_labels(Variable): A 1-level 2-D LoDTensor with shape :math:`[G, 1]` represents
examples. categories of all ground-truth boxes, and the values are in the range of
:math:`[1, C]`. :math:`G` is the total number of all ground-truth boxes
in a mini-batch, and each ground-truth box has one category. The data type
of :attr:`gt_labels` is int32.
is_crowd(Variable): A 1-level 1-D LoDTensor with shape :math:`[G]` which
indicates whether a ground-truth box is a crowd. If the value is 1, the
corresponding box is a crowd, it is ignored during training. :math:`G` is
the total number of all ground-truth boxes in a mini-batch. The data type
of :attr:`is_crowd` is int32.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents the size
information of input images. :math:`N` is the batch size, the size
information of each image is a 3-vector which are the height and width
of the network input along with the factor scaling the original image to
the network input. The data type of :attr:`im_info` is float32.
num_classes(int32): The number of categories for classification, the default
value is 1.
positive_overlap(float32): Minimum overlap required between an anchor
and ground-truth box for the anchor to be a positive sample, the default
value is 0.5.
negative_overlap(float32): Maximum overlap allowed between an anchor
and ground-truth box for the anchor to be a negative sample, the default
value is 0.4. :attr:`negative_overlap` should be less than or equal to
:attr:`positive_overlap`, if not, the actual value of
:attr:`positive_overlap` is :attr:`negative_overlap`.
Returns: Returns:
tuple: A tuple with 6 Variables:
A tuple(predicted_scores, predicted_location, target_label,
target_bbox, bbox_inside_weight, fg_num) is returned. The **predict_scores** (Variable): A 2-D Tensor with shape :math:`[F+B, C]` represents
predicted_scores and predicted_location are the predicted result category prediction belonging to positive and negative samples. :math:`F`
of the retinanet.The target_label and target_bbox are the ground is the number of positive samples in a mini-batch, :math:`B` is the number
truth, respectively. The predicted_location is a 2D Tensor with of negative samples, and :math:`C` is the number of categories
shape [F, 4], and the shape of target_bbox is same as the shape of (**Notice: excluding background**). The data type of :attr:`predict_scores`
the predicted_location, F is the number of the foreground is float32 or float64.
anchors. The predicted_scores is a 2D Tensor with shape
[F + B, C], and the shape of target_label is [F + B, 1], B is the **predict_location** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents
number of the background anchors, the F and B is depends on the location prediction belonging to positive samples. :math:`F` is the number
input of this operator. Bbox_inside_weight represents whether the of positive samples, and each
predicted location is fake foreground or not and the shape is [F, 4]. sample has 4 coordinate values. The data type of :attr:`predict_location`
Fg_num is the foreground number (including fake foreground) which is float32 or float64.
is needed by focal loss.
**target_label** (Variable): A 2-D Tensor with shape :math:`[F+B, 1]` represents
target labels for classification belonging to positive and negative
samples. :math:`F` is the number of positive samples, :math:`B` is the
number of negative samples, and each sample has one target category. The data type
of :attr:`target_label` is int32.
**target_bbox** (Variable): A 2-D Tensor with shape :math:`[F, 4]` represents
target locations for box regression belonging to positive samples.
:math:`F` is the number of positive samples, and each sample has 4
coordinate values. The data type of :attr:`target_bbox` is float32 or
float64.
**bbox_inside_weight** (Variable): A 2-D Tensor with shape :math:`[F, 4]`
represents whether a positive sample is a fake positive. If a positive
sample is a fake positive, the corresponding entries in
:attr:`bbox_inside_weight` are set to 0, otherwise 1. :math:`F` is the number
of total positive samples in a mini-batch, and each sample has 4
coordinate values. The data type of :attr:`bbox_inside_weight` is float32
or float64.
**fg_num** (Variable): A 2-D Tensor with shape :math:`[N, 1]` represents the number
of positive samples. :math:`N` is the batch size. **Notice: The number
of positive samples is used as the denominator of later loss function,
to avoid the condition that the denominator is zero, this OP has added 1
to the actual number of positive samples of each image.** The data type of
:attr:`fg_num` is int32.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle.fluid as fluid
bbox_pred = layers.data(name='bbox_pred', shape=[1, 100, 4], bbox_pred = fluid.data(name='bbox_pred', shape=[1, 100, 4],
append_batch_size=False, dtype='float32') dtype='float32')
cls_logits = layers.data(name='cls_logits', shape=[1, 100, 10], cls_logits = fluid.data(name='cls_logits', shape=[1, 100, 10],
append_batch_size=False, dtype='float32') dtype='float32')
anchor_box = layers.data(name='anchor_box', shape=[100, 4], anchor_box = fluid.data(name='anchor_box', shape=[100, 4],
append_batch_size=False, dtype='float32') dtype='float32')
anchor_var = layers.data(name='anchor_var', shape=[100, 4], anchor_var = fluid.data(name='anchor_var', shape=[100, 4],
append_batch_size=False, dtype='float32') dtype='float32')
gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], gt_boxes = fluid.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32') dtype='float32')
gt_labels = layers.data(name='gt_labels', shape=[10, 1], gt_labels = fluid.data(name='gt_labels', shape=[10, 1],
append_batch_size=False, dtype='float32') dtype='float32')
is_crowd = fluid.layers.data(name='is_crowd', shape=[1], is_crowd = fluid.data(name='is_crowd', shape=[1],
append_batch_size=False, dtype='float32') dtype='float32')
im_info = fluid.layers.data(name='im_infoss', shape=[1, 3], im_info = fluid.data(name='im_infoss', shape=[1, 3],
append_batch_size=False, dtype='float32') dtype='float32')
loc_pred, score_pred, loc_target, score_target, bbox_inside_weight, fg_num = score_pred, loc_pred, score_target, loc_target, bbox_inside_weight, fg_num =
fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box, fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box,
anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10) anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10)
...@@ -370,48 +436,61 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2, alpha=0.25): ...@@ -370,48 +436,61 @@ def sigmoid_focal_loss(x, label, fg_num, gamma=2, alpha=0.25):
""" """
**Sigmoid Focal Loss Operator.** **Sigmoid Focal Loss Operator.**
Focal loss is used to address the foreground-background class imbalance existed `Focal Loss <https://arxiv.org/abs/1708.02002>`_ is used to address the foreground-background
on the training phase of one-stage detectors. This operator computes the sigmoid class imbalance existing in the training phase of many computer vision tasks. This OP computes
value for each element in the input tensor, after which focal loss is measured. the sigmoid value for each element in the input tensor :attr:`x`, after which focal loss is
measured between the sigmoid value and target label.
The focal loss is given as follows: The focal loss is given as follows:
.. math:: .. math::
loss_j = (-label_j * alpha * {(1 - \\sigma(x_j))}^{gamma} * \\log(\\sigma(x_j)) -
(1 - labels_j) * (1 - alpha) * {(\sigma(x_j)}^{ gamma} * \\log(1 - \\sigma(x_j))) \\mathop{loss_{i,\\,j}}\\limits_{i\\in\\mathbb{[0,\\,N-1]},\\,j\\in\\mathbb{[0,\\,C-1]}}=\\left\\{
/ fg\_num, j = 1,...,K \\begin{array}{rcl}
- \\frac{1}{fg\_num} * \\alpha * {(1 - \\sigma(x_{i,\\,j}))}^{\\gamma} * \\log(\\sigma(x_{i,\\,j})) & & {(j +1) = label_{i,\\,0}} \\\\
- \\frac{1}{fg\_num} * (1 - \\alpha) * {\\sigma(x_{i,\\,j})}^{ \\gamma} * \\log(1 - \\sigma(x_{i,\\,j})) & & {(j +1)!= label_{i,\\,0}}
\\end{array} \\right.
We know that We know that
.. math:: .. math::
\\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)} \\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)}
Args:
x(Variable): A 2-D tensor with shape [N, D], where N is the batch size and D is the number
of classes (excluding background). This input is a tensor of logits computed by the
previous operator.
label(Variable): A 2-D tensor with shape [N, 1], which is the probabilistic labels.
fg_num(Variable): A 1-D tensor with shape [1], which is the number of foreground.
Args:
x(Variable): A 2-D tensor with shape :math:`[N, C]` represents the predicted categories of
all samples. :math:`N` is the number of all samples responsible for optimization in
a mini-batch, for example, samples are anchor boxes for object detection and :math:`N`
is the total number of positive and negative samples in a mini-batch; Samples are images
for image classification and :math:`N` is the number of images in a mini-batch. :math:`C`
is the number of classes (**Notice: excluding background**). The data type of :attr:`x` is
float32 or float64.
label(Variable): A 2-D tensor with shape :math:`[N, 1]` represents the target labels for
classification. :math:`N` is the number of all samples responsible for optimization in a
mini-batch, each sample has one target category. The values for positive samples are in the
range of :math:`[1, C]`, and the values for negative samples are 0. The data type of :attr:`label`
is int32.
fg_num(Variable): A 1-D tensor with shape [1] represents the number of positive samples in a
mini-batch, which should be obtained before this OP. The data type of :attr:`fg_num` is int32.
gamma(float): Hyper-parameter to balance the easy and hard examples. Default value is gamma(float): Hyper-parameter to balance the easy and hard examples. Default value is
set to 2.0. set to 2.0.
alpha(float): Hyper-parameter to balance the positive and negative example. Default value alpha(float): Hyper-parameter to balance the positive and negative example. Default value
is set to 0.25. is set to 0.25.
Returns: Returns:
out(Variable): A 2-D tensor with shape [N, D], which is the focal loss. Variable(the data type is float32 or float64):
A 2-D tensor with shape :math:`[N, C]`, which is the focal loss of each element in the input
tensor :attr:`x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid import paddle.fluid as fluid
input = fluid.layers.data( input = fluid.data(name='data', shape=[10,80], dtype='float32')
name='data', shape=[10,80], append_batch_size=False, dtype='float32') label = fluid.data(name='label', shape=[10,1], dtype='int32')
label = fluid.layers.data( fg_num = fluid.data(name='fg_num', shape=[1], dtype='int32')
name='label', shape=[10,1], append_batch_size=False, dtype='int32')
fg_num = fluid.layers.data(
name='fg_num', shape=[1], append_batch_size=False, dtype='int32')
loss = fluid.layers.sigmoid_focal_loss(x=input, loss = fluid.layers.sigmoid_focal_loss(x=input,
label=label, label=label,
fg_num=fg_num, fg_num=fg_num,
...@@ -2654,78 +2733,109 @@ def retinanet_detection_output(bboxes, ...@@ -2654,78 +2733,109 @@ def retinanet_detection_output(bboxes,
nms_threshold=0.3, nms_threshold=0.3,
nms_eta=1.): nms_eta=1.):
""" """
**Detection Output Layer for Retinanet.** **Detection Output Layer for the detector RetinaNet.**
This operation is to get the detection results by performing following In the detector `RetinaNet <https://arxiv.org/abs/1708.02002>`_ , many
steps: `FPN <https://arxiv.org/abs/1612.03144>`_ levels output the category
and location predictions, this OP is to get the detection results by
performing the following steps:
1. Decode top-scoring bounding box predictions per FPN level according 1. For each FPN level, decode box predictions according to the anchor
to the anchor boxes. boxes from at most :attr:`nms_top_k` top-scoring predictions after
thresholding detector confidence at :attr:`score_threshold`.
2. Merge top predictions from all levels and apply multi-class non 2. Merge top predictions from all levels and apply multi-class non
maximum suppression (NMS) on them to get the final detections. maximum suppression (NMS) on them to get the final detections.
Args: Args:
bboxes(List): A list of tensors from multiple FPN levels. Each bboxes(List): A list of Tensors from multiple FPN levels represents
element is a 3-D Tensor with shape [N, Mi, 4] representing the the location prediction for all anchor boxes. Each element is
predicted locations of Mi bounding boxes. N is the batch size, a 3-D Tensor with shape :math:`[N, Mi, 4]`, :math:`N` is the
Mi is the number of bounding boxes from i-th FPN level and each batch size, :math:`Mi` is the number of bounding boxes from
bounding box has four coordinate values and the layout is :math:`i`-th FPN level and each bounding box has four coordinate
[xmin, ymin, xmax, ymax]. values and the layout is [xmin, ymin, xmax, ymax]. The data type
scores(List): A list of tensors from multiple FPN levels. Each of each element is float32 or float64.
element is a 3-D Tensor with shape [N, Mi, C] representing the scores(List): A list of Tensors from multiple FPN levels represents
predicted confidence predictions. N is the batch size, C is the the category prediction for all anchor boxes. Each element is a
class number (excluding background), Mi is the number of bounding 3-D Tensor with shape :math:`[N, Mi, C]`, :math:`N` is the batch
boxes from i-th FPN level. For each bounding box, there are total size, :math:`C` is the class number (**excluding background**),
C scores. :math:`Mi` is the number of bounding boxes from :math:`i`-th FPN
anchors(List): A 2-D Tensor with shape [Mi, 4] represents the locations level. The data type of each element is float32 or float64.
of Mi anchor boxes from all FPN level. Each bounding box has four anchors(List): A list of Tensors from multiple FPN levels represents
the locations of all anchor boxes. Each element is a 2-D Tensor
with shape :math:`[Mi, 4]`, :math:`Mi` is the number of bounding
boxes from :math:`i`-th FPN level, and each bounding box has four
coordinate values and the layout is [xmin, ymin, xmax, ymax]. coordinate values and the layout is [xmin, ymin, xmax, ymax].
im_info(Variable): A 2-D LoDTensor with shape [N, 3] represents the The data type of each element is float32 or float64.
image information. N is the batch size, each image information im_info(Variable): A 2-D Tensor with shape :math:`[N, 3]` represents the size
includes height, width and scale. information of input images. :math:`N` is the batch size, the size
information of each image is a 3-vector consisting of the height and width
of the network input along with the factor scaling the origin image to
the network input. The data type of :attr:`im_info` is float32.
score_threshold(float): Threshold to filter out bounding boxes score_threshold(float): Threshold to filter out bounding boxes
with a confidence score. with a confidence score before NMS, default value is set to 0.05.
nms_top_k(int): Maximum number of detections per FPN layer to be nms_top_k(int): Maximum number of detections per FPN layer to be
kept according to the confidences before NMS. kept according to the confidences before NMS, default value is set to
1000.
keep_top_k(int): Number of total bounding boxes to be kept per image after keep_top_k(int): Number of total bounding boxes to be kept per image after
NMS step. -1 means keeping all bounding boxes after NMS step. NMS step. Default value is set to 100, -1 means keeping all bounding
nms_threshold(float): The threshold to be used in NMS. boxes after NMS step.
nms_eta(float): The parameter for adaptive NMS. nms_threshold(float): The Intersection-over-Union(IoU) threshold used to
filter out boxes in NMS.
nms_eta(float): The parameter for adjusting :attr:`nms_threshold` in NMS.
Default value is set to 1., which means the value of
:attr:`nms_threshold` stays the same in NMS. If :attr:`nms_eta` is set
to be lower than 1. and the value of :attr:`nms_threshold` is set to
be higher than 0.5, every time a bounding box is filtered out,
:attr:`nms_threshold` is adjusted as :attr:`nms_threshold`
= :attr:`nms_threshold` * :attr:`nms_eta`, and this continues until
the actual value of :attr:`nms_threshold` is lower than or equal to
0.5.
**Notice**: In some cases where the image sizes are very small, it's possible
that there is no detection if :attr:`score_threshold` is used at all
levels. Hence, this OP does not filter out anchors from the highest FPN level
before NMS. And the last element in :attr:`bboxes`, :attr:`scores` and
:attr:`anchors` is required to be from the highest FPN level.
Returns: Returns:
Variable: Variable(The data type is float32 or float64):
The detection output is a LoDTensor with shape [No, 6]. The detection output is a 1-level LoDTensor with shape :math:`[No, 6]`.
Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
`No` is the total number of detections in this mini-batch. For each :math:`No` is the total number of detections in this mini-batch.
instance, the offsets in first dimension are called LoD, the offset The :math:`i`-th image has `LoD[i + 1] - LoD[i]` detected
number is N + 1, N is the batch size. The i-th image has results, if `LoD[i + 1] - LoD[i]` is 0, the :math:`i`-th image
`LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
has no detected results. If all images have no detected results, has no detected results. If all images have no detected results,
LoD will be set to 0, and the output tensor is empty (None). LoD will be set to 0, and the output tensor is empty (None).
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
bboxes = layers.data(name='bboxes', shape=[1, 21, 4], import paddle.fluid as fluid
append_batch_size=False, dtype='float32')
scores = layers.data(name='scores', shape=[1, 21, 10], bboxes_low = fluid.data(
append_batch_size=False, dtype='float32') name='bboxes_low', shape=[1, 44, 4], dtype='float32')
anchors = layers.data(name='anchors', shape=[21, 4], bboxes_high = fluid.data(
append_batch_size=False, dtype='float32') name='bboxes_high', shape=[1, 11, 4], dtype='float32')
im_info = layers.data(name="im_info", shape=[1, 3], scores_low = fluid.data(
append_batch_size=False, dtype='float32') name='scores_low', shape=[1, 44, 10], dtype='float32')
nmsed_outs = fluid.layers.retinanet_detection_output( scores_high = fluid.data(
bboxes=[bboxes, bboxes], name='scores_high', shape=[1, 11, 10], dtype='float32')
scores=[scores, scores], anchors_low = fluid.data(
anchors=[anchors, anchors], name='anchors_low', shape=[44, 4], dtype='float32')
im_info=im_info, anchors_high = fluid.data(
score_threshold=0.05, name='anchors_high', shape=[11, 4], dtype='float32')
nms_top_k=1000, im_info = fluid.data(
keep_top_k=100, name="im_info", shape=[1, 3], dtype='float32')
nms_threshold=0.3, nmsed_outs = fluid.layers.retinanet_detection_output(
nms_eta=1.) bboxes=[bboxes_low, bboxes_high],
scores=[scores_low, scores_high],
anchors=[anchors_low, anchors_high],
im_info=im_info,
score_threshold=0.05,
nms_top_k=1000,
keep_top_k=100,
nms_threshold=0.45,
nms_eta=1.)
""" """
helper = LayerHelper('retinanet_detection_output', **locals()) helper = LayerHelper('retinanet_detection_output', **locals())
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册