diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index de74902212c749c982811d0f881538a88650370e..87dd94bb17a954bdee64af1794ec63cb9ca08f02 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -33,6 +33,7 @@ import six
 import numpy as np
 from functools import reduce
 from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from paddle.utils import deprecated
 
 __all__ = [
     'prior_box',
@@ -998,6 +999,7 @@ def polygon_box_transform(input, name=None):
     return output
 
 
+@deprecated(since="2.0.0", update_to="paddle.vision.ops.yolo_loss")
 @templatedoc(op_type="yolov3_loss")
 def yolov3_loss(x,
                 gt_box,
@@ -1127,6 +1129,7 @@ def yolov3_loss(x,
     return loss
 
 
+@deprecated(since="2.0.0", update_to="paddle.vision.ops.yolo_box")
 @templatedoc(op_type="yolo_box")
 def yolo_box(x,
              img_size,
diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
index ef53d8cec34a2ed1ce3db013094452b2ab9e7108..844115d4acecc40b92e2f14dbc8f2a4f3df920dd 100644
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 from op_test import OpTest
 
+import paddle
 from paddle.fluid import core
 
 
@@ -151,5 +152,44 @@ class TestYoloBoxOpScaleXY(TestYoloBoxOp):
         self.scale_x_y = 1.2
 
 
+class TestYoloBoxDygraph(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        x = np.random.random([2, 14, 8, 8]).astype('float32')
+        img_size = np.ones((2, 2)).astype('int32')
+
+        x = paddle.to_tensor(x)
+        img_size = paddle.to_tensor(img_size)
+
+        boxes, scores = paddle.vision.ops.yolo_box(
+            x,
+            img_size=img_size,
+            anchors=[10, 13, 16, 30],
+            class_num=2,
+            conf_thresh=0.01,
+            downsample_ratio=8,
+            clip_bbox=True,
+            scale_x_y=1.)
+        assert boxes is not None and scores is not None
+        paddle.enable_static()
+
+
+class TestYoloBoxStatic(unittest.TestCase):
+    def test_static(self):
+        x = paddle.static.data('x', [2, 14, 8, 8], 'float32')
+        img_size = paddle.static.data('img_size', [2, 2], 'int32')
+
+        boxes, scores = paddle.vision.ops.yolo_box(
+            x,
+            img_size=img_size,
+            anchors=[10, 13, 16, 30],
+            class_num=2,
+            conf_thresh=0.01,
+            downsample_ratio=8,
+            clip_bbox=True,
+            scale_x_y=1.)
+        assert boxes is not None and scores is not None
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
index db73160c489b0584ab33b11061f0cc3f81f7da38..1ec1d1527e178a35fba5a2b949191f72e0a7726b 100644
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -20,6 +20,7 @@ from scipy.special import logit
 from scipy.special import expit
 from op_test import OpTest
 
+import paddle
 from paddle.fluid import core
 
 
@@ -281,5 +282,66 @@ class TestYolov3LossWithScaleXY(TestYolov3LossOp):
         self.scale_x_y = 1.2
 
 
+class TestYolov3LossDygraph(unittest.TestCase):
+    def test_dygraph(self):
+        paddle.disable_static()
+        x = np.random.random([2, 14, 8, 8]).astype('float32')
+        gt_box = np.random.random([2, 10, 4]).astype('float32')
+        gt_label = np.random.random([2, 10]).astype('int32')
+
+        x = paddle.to_tensor(x)
+        gt_box = paddle.to_tensor(gt_box)
+        gt_label = paddle.to_tensor(gt_label)
+
+        loss = paddle.vision.ops.yolo_loss(
+            x,
+            gt_box=gt_box,
+            gt_label=gt_label,
+            anchors=[10, 13, 16, 30],
+            anchor_mask=[0, 1],
+            class_num=2,
+            ignore_thresh=0.7,
+            downsample_ratio=8,
+            use_label_smooth=True,
+            scale_x_y=1.)
+        assert loss is not None
+        paddle.enable_static()
+
+
+class TestYolov3LossStatic(unittest.TestCase):
+    def test_static(self):
+        x = paddle.static.data('x', [2, 14, 8, 8], 'float32')
+        gt_box = paddle.static.data('gt_box', [2, 10, 4], 'float32')
+        gt_label = paddle.static.data('gt_label', [2, 10], 'int32')
+        gt_score = paddle.static.data('gt_score', [2, 10], 'float32')
+
+        loss = paddle.vision.ops.yolo_loss(
+            x,
+            gt_box=gt_box,
+            gt_label=gt_label,
+            anchors=[10, 13, 16, 30],
+            anchor_mask=[0, 1],
+            class_num=2,
+            ignore_thresh=0.7,
+            downsample_ratio=8,
+            gt_score=gt_score,
+            use_label_smooth=True,
+            scale_x_y=1.)
+        assert loss is not None
+
+        loss = paddle.vision.ops.yolo_loss(
+            x,
+            gt_box=gt_box,
+            gt_label=gt_label,
+            anchors=[10, 13, 16, 30],
+            anchor_mask=[0, 1],
+            class_num=2,
+            ignore_thresh=0.7,
+            downsample_ratio=8,
+            use_label_smooth=True,
+            scale_x_y=1.)
+        assert loss is not None
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py
index db5a94f93293471f92407d739b0f094652de4ceb..aeb07bf281fb0a0289640e0591af4d864ca10b39 100644
--- a/python/paddle/vision/__init__.py
+++ b/python/paddle/vision/__init__.py
@@ -24,6 +24,8 @@ from .datasets import *
 from . import image
 from .image import *
 
+from . import ops
+
 __all__ = models.__all__ \
           + transforms.__all__ \
           + datasets.__all__ \
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..892f3a258146a0c860f105837ba037e18d78b633
--- /dev/null
+++ b/python/paddle/vision/ops.py
@@ -0,0 +1,388 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from ..fluid.layer_helper import LayerHelper
+from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
+from ..fluid import core, layers
+
+from paddle.common_ops_import import *
+
+__all__ = ['yolo_loss', 'yolo_box']
+
+
+def yolo_loss(x,
+              gt_box,
+              gt_label,
+              anchors,
+              anchor_mask,
+              class_num,
+              ignore_thresh,
+              downsample_ratio,
+              gt_score=None,
+              use_label_smooth=True,
+              name=None,
+              scale_x_y=1.):
+    """
+
+    This operator generates the YOLOv3 loss based on given predicted results
+    and ground truth boxes.
+
+    The output of the previous network is in shape [N, C, H, W], where H and W
+    should be the same and specify the grid size. Each grid point predicts a
+    given number of bounding boxes; this number, denoted below as S, is
+    specified by the number of anchor clusters in each scale. In the second
+    (channel) dimension, C should be equal to S * (class_num + 5), where
+    class_num is the object category number of the source dataset (such as 80
+    in the COCO dataset); so the second (channel) dimension, apart from the 4
+    box location coordinates x, y, w, h, also includes the confidence score of
+    the box and the class one-hot key of each anchor box.
+
+    Assuming the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box
+    predictions are as follows:
+
+    $$
+    b_x = \\sigma(t_x) + c_x
+    $$
+    $$
+    b_y = \\sigma(t_y) + c_y
+    $$
+    $$
+    b_w = p_w e^{t_w}
+    $$
+    $$
+    b_h = p_h e^{t_h}
+    $$
+
+    In the equations above, :math:`c_x, c_y` is the top-left corner of the
+    current grid and :math:`p_w, p_h` is specified by the anchors.
+
+    As for the confidence score, it is the logistic regression value of the IoU
+    between anchor boxes and ground truth boxes; the score of the anchor box
+    with the max IoU should be 1, and if an anchor box has an IoU greater than
+    the ignore threshold, the confidence score loss of that anchor box will be
+    ignored.
+
+    Therefore, the YOLOv3 loss consists of three major parts: box location
+    loss, objectness loss and classification loss. The L1 loss is used for
+    the box coordinates (w, h), while sigmoid cross entropy loss is used for
+    the box coordinates (x, y), the objectness loss and the classification loss.
+
+    Each ground truth box finds its best matching anchor box among all anchors.
+    The prediction of this anchor box will incur all three parts of the loss,
+    while predictions of anchor boxes with no matched GT box will only incur
+    the objectness loss.
+
+    In order to trade off box coordinate losses between big boxes and small
+    boxes, box coordinate losses are multiplied by a scale weight, which is
+    calculated as follows.
+
+    $$
+    weight_{box} = 2.0 - t_w * t_h
+    $$
+
+    The final loss is represented as follows.
+
+    $$
+    loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class}
+    $$
+
+    When :attr:`use_label_smooth` is set to :attr:`True`, the classification
+    target will be smoothed when calculating the classification loss; the
+    target of positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num`
+    and the target of negative samples will be smoothed to :math:`1.0 / class\_num`.
+
+    When :attr:`gt_score` is given, i.e., the mixup score of the ground truth
+    boxes, all losses incurred by a ground truth box will be multiplied by its
+    mixup score.
+
+    Args:
+        x (Tensor): The input tensor of the YOLOv3 loss operator. This is a 4-D
+                    tensor with shape [N, C, H, W].
+                    H and W should be the same, and the second dimension (C)
+                    stores the box locations, confidence score and
+                    classification one-hot keys of each anchor box.
+                    The data type is float32 or float64.
+        gt_box (Tensor): ground truth boxes, should be in shape of [N, B, 4];
+                         in the third dimension, x, y, w, h should be stored.
+                         x, y is the center coordinate of the boxes, w, h are
+                         the width and height, and x, y, w, h should be divided
+                         by the input image height to scale to [0, 1].
+                         N is the batch number and B is the max box number in
+                         an image. The data type is float32 or float64.
+        gt_label (Tensor): class id of ground truth boxes, should be in shape
+                           of [N, B]. The data type is int32.
+        anchors (list|tuple): The anchor width and height, it will be parsed
+                              pair by pair.
+        anchor_mask (list|tuple): The mask index of anchors used in the current
+                                  YOLOv3 loss calculation.
+        class_num (int): The number of classes.
+        ignore_thresh (float): The ignore threshold to ignore confidence loss.
+        downsample_ratio (int): The downsample ratio from the network input to the
+                                YOLOv3 loss input, so 32, 16, 8 should be set for
+                                the first, second, and third YOLOv3 loss operators.
+        name (string): The default value is None. Normally there is no need
+                       for the user to set this property. For more information,
+                       please refer to :ref:`api_guide_Name`
+        gt_score (Tensor): mixup score of ground truth boxes, should be in shape
+                           of [N, B]. Default None.
+        use_label_smooth (bool): Whether to use label smoothing. Default True.
+        scale_x_y (float): Scale the center point of the decoded bounding box.
+                           Default 1.0
+
+    Returns:
+        Tensor: A 1-D tensor with shape [N], the value of the yolov3 loss
+
+    Raises:
+        TypeError: Input x of yolo_loss must be Tensor
+        TypeError: Input gt_box of yolo_loss must be Tensor
+        TypeError: Input gt_label of yolo_loss must be Tensor
+        TypeError: Input gt_score of yolo_loss must be None or Tensor
+        TypeError: Attr anchors of yolo_loss must be list or tuple
+        TypeError: Attr class_num of yolo_loss must be an integer
+        TypeError: Attr ignore_thresh of yolo_loss must be a float number
+        TypeError: Attr use_label_smooth of yolo_loss must be a bool value
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x = np.random.random([2, 14, 8, 8]).astype('float32')
+            gt_box = np.random.random([2, 10, 4]).astype('float32')
+            gt_label = np.random.random([2, 10]).astype('int32')
+
+            x = paddle.to_tensor(x)
+            gt_box = paddle.to_tensor(gt_box)
+            gt_label = paddle.to_tensor(gt_label)
+
+            loss = paddle.vision.ops.yolo_loss(x,
+                                               gt_box=gt_box,
+                                               gt_label=gt_label,
+                                               anchors=[10, 13, 16, 30],
+                                               anchor_mask=[0, 1],
+                                               class_num=2,
+                                               ignore_thresh=0.7,
+                                               downsample_ratio=8,
+                                               use_label_smooth=True,
+                                               scale_x_y=1.)
+ """ + + if in_dygraph_mode() and gt_score is None: + loss = core.ops.yolov3_loss( + x, gt_box, gt_label, 'anchors', anchors, 'anchor_mask', anchor_mask, + 'class_num', class_num, 'ignore_thresh', ignore_thresh, + 'downsample_ratio', downsample_ratio, 'use_label_smooth', + use_label_smooth, 'scale_x_y', scale_x_y) + return loss + + helper = LayerHelper('yolov3_loss', **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'yolo_loss') + check_variable_and_dtype(gt_box, 'gt_box', ['float32', 'float64'], + 'yolo_loss') + check_variable_and_dtype(gt_label, 'gt_label', 'int32', 'yolo_loss') + check_type(anchors, 'anchors', (list, tuple), 'yolo_loss') + check_type(anchor_mask, 'anchor_mask', (list, tuple), 'yolo_loss') + check_type(class_num, 'class_num', int, 'yolo_loss') + check_type(ignore_thresh, 'ignore_thresh', float, 'yolo_loss') + check_type(use_label_smooth, 'use_label_smooth', bool, 'yolo_loss') + + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + + objectness_mask = helper.create_variable_for_type_inference(dtype='int32') + gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": x, + "GTBox": gt_box, + "GTLabel": gt_label, + } + if gt_score is not None: + inputs["GTScore"] = gt_score + + attrs = { + "anchors": anchors, + "anchor_mask": anchor_mask, + "class_num": class_num, + "ignore_thresh": ignore_thresh, + "downsample_ratio": downsample_ratio, + "use_label_smooth": use_label_smooth, + "scale_x_y": scale_x_y, + } + + helper.append_op( + type='yolov3_loss', + inputs=inputs, + outputs={ + 'Loss': loss, + 'ObjectnessMask': objectness_mask, + 'GTMatchMask': gt_match_mask + }, + attrs=attrs) + return loss + + +def yolo_box(x, + img_size, + anchors, + class_num, + conf_thresh, + downsample_ratio, + clip_bbox=True, + name=None, + scale_x_y=1.): + """ + + This operator generates YOLO detection boxes from output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: + + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represents the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of + confidence scores and classification scores. 
+
+    $$
+    score_{pred} = score_{conf} * score_{class}
+    $$
+
+    Args:
+        x (Tensor): The input tensor of the YoloBox operator is a 4-D tensor
+                    with shape [N, C, H, W]. The second dimension (C) stores the
+                    box locations, confidence score and classification one-hot
+                    keys of each anchor box. Generally, X should be the output
+                    of a YOLOv3 network. The data type is float32 or float64.
+        img_size (Tensor): The image size tensor of the YoloBox operator. This
+                           is a 2-D tensor with shape [N, 2]. This tensor holds
+                           the height and width of each input image, used for
+                           resizing the output boxes to the input image scale.
+                           The data type is int32.
+        anchors (list|tuple): The anchor width and height, it will be parsed
+                              pair by pair.
+        class_num (int): The number of classes.
+        conf_thresh (float): The confidence score threshold of detection boxes.
+                             Boxes with confidence scores under the threshold
+                             should be ignored.
+        downsample_ratio (int): The downsample ratio from the network input to
+                                the :attr:`yolo_box` operator input, so 32, 16, 8
+                                should be set for the first, second, and third
+                                :attr:`yolo_box` layer.
+        clip_bbox (bool): Whether to clip the output bounding box within the
+                          :attr:`img_size` boundary. Default true.
+        scale_x_y (float): Scale the center point of the decoded bounding box.
+                           Default 1.0
+        name (string): The default value is None. Normally there is no need
+                       for the user to set this property. For more information,
+                       please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tuple: A tuple of two tensors: a 3-D tensor with shape [N, M, 4], the
+        coordinates of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`],
+        the classification scores of boxes.
+
+    Raises:
+        TypeError: Input x of yolo_box must be Tensor
+        TypeError: Attr anchors of yolo_box must be list or tuple
+        TypeError: Attr class_num of yolo_box must be an integer
+        TypeError: Attr conf_thresh of yolo_box must be a float number
+
+    Examples:
+
+    .. code-block:: python
+
+        import paddle
+        import numpy as np
+
+        x = np.random.random([2, 14, 8, 8]).astype('float32')
+        img_size = np.ones((2, 2)).astype('int32')
+
+        x = paddle.to_tensor(x)
+        img_size = paddle.to_tensor(img_size)
+
+        boxes, scores = paddle.vision.ops.yolo_box(x,
+                                                   img_size=img_size,
+                                                   anchors=[10, 13, 16, 30],
+                                                   class_num=2,
+                                                   conf_thresh=0.01,
+                                                   downsample_ratio=8,
+                                                   clip_bbox=True,
+                                                   scale_x_y=1.)
+    """
+    if in_dygraph_mode():
+        boxes, scores = core.ops.yolo_box(
+            x, img_size, 'anchors', anchors, 'class_num', class_num,
+            'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio,
+            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y)
+        return boxes, scores
+
+    helper = LayerHelper('yolo_box', **locals())
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'yolo_box')
+    check_variable_and_dtype(img_size, 'img_size', 'int32', 'yolo_box')
+    check_type(anchors, 'anchors', (list, tuple), 'yolo_box')
+    check_type(conf_thresh, 'conf_thresh', float, 'yolo_box')
+
+    boxes = helper.create_variable_for_type_inference(dtype=x.dtype)
+    scores = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    attrs = {
+        "anchors": anchors,
+        "class_num": class_num,
+        "conf_thresh": conf_thresh,
+        "downsample_ratio": downsample_ratio,
+        "clip_bbox": clip_bbox,
+        "scale_x_y": scale_x_y,
+    }
+
+    helper.append_op(
+        type='yolo_box',
+        inputs={
+            "X": x,
+            "ImgSize": img_size,
+        },
+        outputs={
+            'Boxes': boxes,
+            'Scores': scores,
+        },
+        attrs=attrs)
+    return boxes, scores
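
For reviewers who want to exercise the new entry points: the static-mode unit tests above only build the program and never run it. Below is a minimal end-to-end sketch of `paddle.vision.ops.yolo_loss` in static graph mode. It is not part of this patch; it relies only on stock Paddle 2.0 APIs (`paddle.enable_static`, `paddle.static.program_guard`, `paddle.static.Executor`), reuses the shapes from the tests, and the `np.random.randint` labels are an illustrative choice so that class ids stay inside [0, class_num).

    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data('x', [2, 14, 8, 8], 'float32')
        gt_box = paddle.static.data('gt_box', [2, 10, 4], 'float32')
        gt_label = paddle.static.data('gt_label', [2, 10], 'int32')
        loss = paddle.vision.ops.yolo_loss(
            x,
            gt_box=gt_box,
            gt_label=gt_label,
            anchors=[10, 13, 16, 30],
            anchor_mask=[0, 1],
            class_num=2,
            ignore_thresh=0.7,
            downsample_ratio=8,
            use_label_smooth=True,
            scale_x_y=1.)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    loss_np, = exe.run(
        main_prog,
        feed={
            'x': np.random.random([2, 14, 8, 8]).astype('float32'),
            'gt_box': np.random.random([2, 10, 4]).astype('float32'),
            # class ids must lie in [0, class_num); random 0/1 labels here
            'gt_label': np.random.randint(0, 2, [2, 10]).astype('int32'),
        },
        fetch_list=[loss])
    print(loss_np.shape)  # expected: (2,), one loss value per batch item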
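
A matching sketch for `paddle.vision.ops.yolo_box`, under the same assumptions. With 2 anchor pairs and class_num=2, C = 2 * (5 + 2) = 14, and the docstring's shape rule gives M = anchor_num * H * W = 2 * 8 * 8 = 128 boxes per image:

    import numpy as np
    import paddle

    paddle.enable_static()

    main_prog = paddle.static.Program()
    startup_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data('x', [2, 14, 8, 8], 'float32')
        img_size = paddle.static.data('img_size', [2, 2], 'int32')
        boxes, scores = paddle.vision.ops.yolo_box(
            x,
            img_size=img_size,
            anchors=[10, 13, 16, 30],
            class_num=2,
            conf_thresh=0.01,
            downsample_ratio=8,
            clip_bbox=True,
            scale_x_y=1.)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)
    boxes_np, scores_np = exe.run(
        main_prog,
        feed={
            'x': np.random.random([2, 14, 8, 8]).astype('float32'),
            'img_size': np.ones([2, 2]).astype('int32'),
        },
        fetch_list=[boxes, scores])
    # expected: boxes (2, 128, 4), scores (2, 128, 2)
    print(boxes_np.shape, scores_np.shape)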