diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 97eac4258da4047deab74acab389aa91964088ab..c5e6c558e67c375019afa4935e41c1f6f0f00efe 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -358,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '4d170807a13d33925d1049d2892832bf')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', 'eb62b1ff7cc981f3483a62321a491f2e')) paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index e8a186b611e7bcc1ec7069b82148bb8f5f601532..5732b180526c502efea0ca72af87b38e45bfbec2 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -202,13 +202,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. - Therefore, the yolov3 loss consist of three major parts, box location loss, - confidence score loss, and classification loss. The L1 loss is used for - box coordinates (w, h), and sigmoid cross entropy loss is used for box - coordinates (x, y), confidence score loss and classification loss. + Therefore, the yolov3 loss consists of three major parts: box location loss, + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box + coordinates (x, y), objectness loss and classification loss. - Each groud truth box find a best matching anchor box in all anchors, - prediction of this anchor box will incur all three parts of losses, and + Each groud truth box finds a best matching anchor box in all anchors. + Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3da5bedd39f35a12cf7815a466730f22e4fd4410..8aeaf4e92a6ba8bba44ba72a2206361f40e7c16b 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -525,8 +525,10 @@ def yolov3_loss(x, Args: x (Variable): ${x_comment} gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4], - in the third dimenstion, x, y, w, h should be stored - and x, y, w, h should be relative value of input image. + in the third dimenstion, x, y, w, h should be stored. + x,y is the center cordinate of boxes, w, h are the + width and height, x, y, w, h should be divided by + input image height to scale to [0, 1]. N is the batch number and B is the max box number in an image. gt_label (Variable): class id of ground truth boxes, shoud be in shape