diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 65f3dc961f061d8bcc32ade26dd1fa33cbfc3971..6feaa7b484cfc81451d23e9a52942e54edb8f5be 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -177,7 +177,7 @@ paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592')) paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3')) -paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', 'a5be881ada816e47ea7a6ee4396da357')) +paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', '79aaea078ddea57a82ed7906d71dedc7')) paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'eeb1591cfc854c6ffdac77b376313c44')) paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa')) paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', 'fe352915a543cec434f74e9b32ac49da')) @@ -277,7 +277,7 @@ paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 
'x_num_col_dims', 'y_num_col_di paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '7637c974f2d749d359acae9062c4d96f')) paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '406eee439e41988c8a0304186626a0dd')) paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '26decdea9376b6b9a0d3432d82ca207b')) -paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b')) +paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '315b50c1cbd9569375b098c56f1e91c9')) paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5b32ed21ab89140a8e758002923a0da3')) paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ecc4b1323028bde0518d666882d03515')) paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb')) @@ -416,13 +416,13 @@ paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5')) paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 
'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40')) paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b')) -paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '651d98d51879dfa1bc1cd40391786a41')) +paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', 'd46629656b4ce9b07809e32c0482cbef')) paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595')) paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d')) 
paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', 'a7778d4f557c60dca52321673667690d')) paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1')) -paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', '69def376b42ef0681d0cc7f53a2dac4b')) -paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1')) +paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'f2342042127b536a0a16390f149f1bba')) +paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, 
None)), ('document', '5cba014b41610431f8949e2d7336f1cc')) paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef')) paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e24478fd1fcf1727d4947fe14356b3d4')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '511d7033c0cfce1a5b88c04ad6e7ed5b')) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f216f5b4708f842ea579d6afad6de5769b2d2f9b..4a004f3033e8fffdea15dedec0946866fe98a4e4 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -254,70 +254,66 @@ def rpn_target_assign(bbox_pred, bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes. N is the batch size, and each bounding box has four coordinate values and the layout - is [xmin, ymin, xmax, ymax]. + is [xmin, ymin, xmax, ymax]. The data type can be float32 or float64. cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the predicted confidence predictions. N is the batch size, 1 is the frontground and background sigmoid, M is number of bounding boxes. + The data type can be float32 or float64. anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature map, they are close to the origin of the coordinate system. [xmax, ymax] is the right bottom - coordinate of the anchor box. + coordinate of the anchor box. 
The data type can be float32 or float64. anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded - variances of anchors. + variances of anchors. The data type can be float32 or float64. gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth - bboxes of mini-batch input. + bboxes of mini-batch input. The data type can be float32 or float64. is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd. + The data type must be int32. im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, 3 is the height, width and scale. rpn_batch_size_per_im(int): Total number of RPN examples per image. + The data type must be int32. rpn_straddle_thresh(float): Remove RPN anchors that go outside the image - by straddle_thresh pixels. + by straddle_thresh pixels. The data type must be float32. rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled - foreground (i.e. class > 0), 0-th class is background. + foreground (i.e. class > 0), 0-th class is background. The data type must be float32. rpn_positive_overlap(float): Minimum overlap required between an anchor and ground-truth box for the (anchor, gt box) pair to be a positive - example. + example. The data type must be float32. rpn_negative_overlap(float): Maximum overlap allowed between an anchor and ground-truth box for the (anchor, gt box) pair to be a negative - examples. + examples. The data type must be float32. Returns: tuple: - A tuple(predicted_scores, predicted_location, target_label, - target_bbox, bbox_inside_weight) is returned. The predicted_scores - and predicted_location is the predicted result of the RPN. - The target_label and target_bbox is the ground truth, - respectively. The predicted_location is a 2D Tensor with shape - [F, 4], and the shape of target_bbox is same as the shape of - the predicted_location, F is the number of the foreground - anchors. 
The predicted_scores is a 2D Tensor with shape - [F + B, 1], and the shape of target_label is same as the shape - of the predicted_scores, B is the number of the background - anchors, the F and B is depends on the input of this operator. - Bbox_inside_weight represents whether the predicted loc is fake_fg - or not and the shape is [F, 4]. + A tuple(predicted_scores, predicted_location, target_label, + target_bbox, bbox_inside_weight) is returned. The predicted_scores + and predicted_location is the predicted result of the RPN. + The target_label and target_bbox is the ground truth, + respectively. The predicted_location is a 2D Tensor with shape + [F, 4], and the shape of target_bbox is same as the shape of + the predicted_location, F is the number of the foreground + anchors. The predicted_scores is a 2D Tensor with shape + [F + B, 1], and the shape of target_label is same as the shape + of the predicted_scores, B is the number of the background + anchors, the F and B is depends on the input of this operator. + Bbox_inside_weight represents whether the predicted loc is fake_fg + or not and the shape is [F, 4]. Examples: .. 
code-block:: python import paddle.fluid as fluid - bbox_pred = fluid.layers.data(name='bbox_pred', shape=[100, 4], - append_batch_size=False, dtype='float32') - cls_logits = fluid.layers.data(name='cls_logits', shape=[100, 1], - append_batch_size=False, dtype='float32') - anchor_box = fluid.layers.data(name='anchor_box', shape=[20, 4], - append_batch_size=False, dtype='float32') - anchor_var = fluid.layers.data(name='anchor_var', shape=[20, 4], - append_batch_size=False, dtype='float32') - gt_boxes = fluid.layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - is_crowd = fluid.layers.data(name='is_crowd', shape=[1], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_infoss', shape=[1, 3], - append_batch_size=False, dtype='float32') + bbox_pred = fluid.data(name='bbox_pred', shape=[None, 4], dtype='float32') + cls_logits = fluid.data(name='cls_logits', shape=[None, 1], dtype='float32') + anchor_box = fluid.data(name='anchor_box', shape=[None, 4], dtype='float32') + anchor_var = fluid.data(name='anchor_var', shape=[None, 4], dtype='float32') + gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32') + is_crowd = fluid.data(name='is_crowd', shape=[None], dtype='float32') + im_info = fluid.data(name='im_infoss', shape=[None, 3], dtype='float32') loc, score, loc_target, score_target, inside_weight = fluid.layers.rpn_target_assign( bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes, is_crowd, im_info) @@ -2233,8 +2229,7 @@ def generate_proposal_labels(rpn_rois, is_cls_agnostic=False, is_cascade_rcnn=False): """ - - ** Generate Proposal Labels of Faster-RCNN ** + **Generate Proposal Labels of Faster-RCNN** This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, to sample foreground boxes and background boxes, and compute loss target. 
@@ -2252,37 +2247,43 @@ def generate_proposal_labels(rpn_rois, Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss. Args: - rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format. - gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth. - is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd. + rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format. The data type can be float32 or float64. + gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth. The data type must be int32. + is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd. The data type must be int32. gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format. im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale. - batch_size_per_im(int): Batch size of rois per images. - fg_fraction(float): Foreground fraction in total batch_size_per_im. - fg_thresh(float): Overlap threshold which is used to chose foreground sample. - bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample. - bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample. - bbox_reg_weights(list|tuple): Box regression weights. 
- class_nums(int): Class number. + batch_size_per_im(int): Batch size of rois per images. The data type must be int32. + fg_fraction(float): Foreground fraction in total batch_size_per_im. The data type must be float32. + fg_thresh(float): Overlap threshold which is used to choose foreground sample. The data type must be float32. + bg_thresh_hi(float): Overlap threshold upper bound which is used to choose background sample. The data type must be float32. + bg_thresh_lo(float): Overlap threshold lower bound which is used to choose background sample. The data type must be float32. + bbox_reg_weights(list|tuple): Box regression weights. The data type must be float32. + class_nums(int): Class number. The data type must be int32. use_random(bool): Use random sampling to choose foreground and background boxes. is_cls_agnostic(bool): bbox regression use class agnostic simply which only represent fg and bg boxes. is_cascade_rcnn(bool): it will filter some bbox crossing the image's boundary when setting True. + Returns: + tuple: + A tuple with format ``(rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights)``. + + - **rois**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4]``. The data type is the same as ``rpn_rois``. + - **labels_int32**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 1]``. The data type must be int32. + - **bbox_targets**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_nums]``. The regression targets of all RoIs. The data type is the same as ``rpn_rois``. + - **bbox_inside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_nums]``. The weights of foreground boxes' regression loss. The data type is the same as ``rpn_rois``. + - **bbox_outside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_nums]``. The weights of regression loss. The data type is the same as ``rpn_rois``. + + Examples: .. 
code-block:: python import paddle.fluid as fluid - rpn_rois = fluid.layers.data(name='rpn_rois', shape=[2, 4], - append_batch_size=False, dtype='float32') - gt_classes = fluid.layers.data(name='gt_classes', shape=[8, 1], - append_batch_size=False, dtype='float32') - is_crowd = fluid.layers.data(name='is_crowd', shape=[8, 1], - append_batch_size=False, dtype='float32') - gt_boxes = fluid.layers.data(name='gt_boxes', shape=[8, 4], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_info', shape=[10, 3], - append_batch_size=False, dtype='float32') + rpn_rois = fluid.data(name='rpn_rois', shape=[None, 4], dtype='float32') + gt_classes = fluid.data(name='gt_classes', shape=[None, 1], dtype='float32') + is_crowd = fluid.data(name='is_crowd', shape=[None, 1], dtype='float32') + gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32') + im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32') rois, labels, bbox, inside_weights, outside_weights = fluid.layers.generate_proposal_labels( rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, class_nums=10) @@ -2512,44 +2513,47 @@ def generate_proposals(scores, scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. N is batch size, A is number of anchors, H and W are height and - width of the feature map. + width of the feature map. The data type must be float32. bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and - anchor location. + anchor location. The data type must be float32. im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale between origin image size and the size of feature map. + The data type must be int32. anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. 
H and W are height and width of the feature map, num_anchors is the box count of each position. Each anchor is - in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of + in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32. + variances(Variable): A 4-D Tensor. The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in - (xcenter, ycenter, w, h) format. + (xcenter, ycenter, w, h) format. The data type must be float32. pre_nms_top_n(float): Number of total bboxes to be kept per - image before NMS. 6000 by default. + image before NMS. The data type must be float32. `6000` by default. post_nms_top_n(float): Number of total bboxes to be kept per - image after NMS. 1000 by default. - nms_thresh(float): Threshold in NMS, 0.5 by default. + image after NMS. The data type must be float32. `1000` by default. + nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. min_size(float): Remove predicted boxes with either height or - width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, - adaptive_threshold = adaptive_threshold * eta in each iteration. + width < min_size. The data type must be float32. `0.1` by default. + eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, + `adaptive_threshold = adaptive_threshold * eta` in each iteration. + + Returns: + tuple: + A tuple with format ``(rpn_rois, rpn_roi_probs)``. + + - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. + - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``. Examples: .. 
code-block:: python import paddle.fluid as fluid - scores = fluid.layers.data(name='scores', shape=[2, 4, 5, 5], - append_batch_size=False, dtype='float32') - bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[2, 16, 5, 5], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_info', shape=[2, 3], - append_batch_size=False, dtype='float32') - anchors = fluid.layers.data(name='anchors', shape=[5, 5, 4, 4], - append_batch_size=False, dtype='float32') - variances = fluid.layers.data(name='variances', shape=[5, 5, 10, 4], - append_batch_size=False, dtype='float32') + scores = fluid.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') + bbox_deltas = fluid.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') + im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32') + anchors = fluid.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') + variances = fluid.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') rois, roi_probs = fluid.layers.generate_proposals(scores, bbox_deltas, im_info, anchors, variances) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 339ad431bab3f401e0fc70389d39efbb24b0eccf..eddc90f46afc11dd5f93dce439988c17c791c980 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6766,7 +6766,7 @@ def warpctc(input, (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with CTC, since a native softmax activation is - interated to the Warp-CTC library, to to normlize values for each row of the + integrated into the Warp-CTC library to normalize values for each row of the input tensor. Args: @@ -6778,14 +6778,15 @@ def warpctc(input, (not including the blank label). When it is a 3-D Tensor, it's shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the length of the longest - input logit sequence. 
+ input logit sequence. The data type must be float32. label (Variable): The ground truth of variable-length sequence, which is a 2-D Tensor with LoD information or a 2-D Tensor without LoD information. When it is a 2-D LoDTensor or 2-D Tensor, it is of the shape [Lg, 1], where Lg is th sum of all labels' length. + The data type must be int32. blank (int, default 0): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the - half-opened interval [0, num_classes + 1). + half-opened interval [0, num_classes + 1). The data type must be int32. norm_by_times(bool, default false): Whether to normalize the gradients by the number of time-step, which is also the sequence's length. There is no need to normalize the gradients if warpctc layer was @@ -6797,40 +6798,72 @@ def warpctc(input, Returns: Variable: The Connectionist Temporal Classification (CTC) loss, - which is a 2-D Tensor of the shape [batch_size, 1]. + which is a 2-D Tensor with the shape [batch_size, 1]. + The data type is the same as input. Examples: + .. code-block:: python # using LoDTensor import paddle.fluid as fluid import numpy as np - label = fluid.layers.data(name='label', shape=[12, 1], - dtype='float32', lod_level=1) - predict = fluid.layers.data(name='predict', - shape=[11, 8], + predict = fluid.data(name='predict', + shape=[None, 5], dtype='float32',lod_level=1) + label = fluid.data(name='label', shape=[None, 1], + dtype='int32', lod_level=1) cost = fluid.layers.warpctc(input=predict, label=label) + place = fluid.CPUPlace() + x=fluid.LoDTensor() + data = np.random.rand(8, 5).astype("float32") + x.set(data, place) + x.set_lod([[0,4,8]]) + y=fluid.LoDTensor() + data = np.random.randint(0, 5, [4, 1]).astype("int32") + y.set(data, place) + y.set_lod([[0,2,4]]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + output= exe.run(feed={"predict": x,"label": y}, + fetch_list=[cost.name]) + print(output) + + .. 
code-block:: python # using Tensor - input_length = fluid.layers.data(name='logits_length', shape=[11], - dtype='int64') - label_length = fluid.layers.data(name='labels_length', shape=[12], - dtype='int64') - target = fluid.layers.data(name='target', shape=[12, 1], - dtype='int32') + import paddle.fluid as fluid + import numpy as np + # length of the longest logit sequence - max_seq_length = 4 + max_seq_length = 5 # number of logit sequences - batch_size = 4 - output = fluid.layers.data(name='output', - shape=[max_seq_length, batch_size, 8], + batch_size = None + logits = fluid.data(name='logits', + shape=[max_seq_length, batch_size, 5], dtype='float32') - loss = fluid.layers.warpctc(input=output,label=target, - input_length=input_length, + logits_length = fluid.data(name='logits_length', shape=[None], + dtype='int64') + label = fluid.layers.data(name='label', shape=[None, 1], + dtype='int32') + label_length = fluid.layers.data(name='labels_length', shape=[None], + dtype='int64') + cost = fluid.layers.warpctc(input=logits, label=label, + input_length=logits_length, label_length=label_length) - + place = fluid.CPUPlace() + batch_size = 2 + x = np.random.rand(max_seq_length, batch_size, 5).astype("float32") + y = np.random.randint(0, 5, [max_seq_length * batch_size, 1]).astype("int32") + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + output= exe.run(feed={"logits": x, + "label": y, + "logits_length": np.array([5, 4]).astype("int64"), + "labels_length": np.array([3, 2]).astype("int64")}, + fetch_list=[cost.name]) + print(output) """ helper = LayerHelper('warpctc', **locals()) this_inputs = {'Logits': [input], 'Label': [label]} @@ -10665,77 +10698,16 @@ def affine_grid(theta, out_shape, name=None): the input feature map should be sampled to produce the transformed output feature map. - .. 
code-block:: text - - * Case 1: - - Given: - - theta = [[[x_11, x_12, x_13] - [x_14, x_15, x_16]] - [[x_21, x_22, x_23] - [x_24, x_25, x_26]]] - - out_shape = [2, 3, 5, 5] - - Step 1: - - Generate normalized coordinates according to out_shape. - The values of the normalized coordinates are in the interval between -1 and 1. - The shape of the normalized coordinates is [2, H, W] as below: - - C = [[[-1. -1. -1. -1. -1. ] - [-0.5 -0.5 -0.5 -0.5 -0.5] - [ 0. 0. 0. 0. 0. ] - [ 0.5 0.5 0.5 0.5 0.5] - [ 1. 1. 1. 1. 1. ]] - [[-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ]]] - C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. - - Step2: - - Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: - C_ = [[-1. -1. 1. ] - [-0.5 -1. 1. ] - [ 0. -1. 1. ] - [ 0.5 -1. 1. ] - [ 1. -1. 1. ] - [-1. -0.5 1. ] - [-0.5 -0.5 1. ] - [ 0. -0.5 1. ] - [ 0.5 -0.5 1. ] - [ 1. -0.5 1. ] - [-1. 0. 1. ] - [-0.5 0. 1. ] - [ 0. 0. 1. ] - [ 0.5 0. 1. ] - [ 1. 0. 1. ] - [-1. 0.5 1. ] - [-0.5 0.5 1. ] - [ 0. 0.5 1. ] - [ 0.5 0.5 1. ] - [ 1. 0.5 1. ] - [-1. 1. 1. ] - [-0.5 1. 1. ] - [ 0. 1. 1. ] - [ 0.5 1. 1. ] - [ 1. 1. 1. ]] - Step3: - Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ - - Args: - theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. - out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. - ``out_shape`` can be a Variable or a list or tuple. - name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + Args: + theta (Variable): A Tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters. + The data type can be float32 or float64. + out_shape (Variable | list | tuple): The shape of target output with format [batch_size, channel, height, width]. + ``out_shape`` can be a Tensor or a list or tuple. 
The data + type must be int32. + name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: The output with shape [N, H, W, 2]. + Variable: A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`. Raises: ValueError: If the type of arguments is not supported. @@ -10745,13 +10717,20 @@ def affine_grid(theta, out_shape, name=None): .. code-block:: python import paddle.fluid as fluid - theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") - out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") - data = fluid.layers.affine_grid(theta, out_shape) - - # or - data = fluid.layers.affine_grid(theta, [5, 3, 28, 28]) - + import numpy as np + place = fluid.CPUPlace() + theta = fluid.data(name="x", shape=[None, 2, 3], dtype="float32") + out_shape = fluid.data(name="y", shape=[4], dtype="int32") + grid_0 = fluid.layers.affine_grid(theta, out_shape) + grid_1 = fluid.layers.affine_grid(theta, [5, 3, 28, 28]) + batch_size=2 + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + output= exe.run(feed={"x": np.random.rand(batch_size,2,3).astype("float32"), + "y": np.array([5, 3, 28, 28]).astype("int32")}, + fetch_list=[grid_0.name, grid_1.name]) + print(output[0]) + print(output[1]) """ helper = LayerHelper('affine_grid')