未验证 提交 54df1ad4 编写于 作者: W whs 提交者: GitHub

Fix en doc of warpctc, affine_grid, generate_proposals (#20262)

上级 89c4b3dd
......@@ -177,7 +177,7 @@ paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized',
paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3'))
paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', 'a5be881ada816e47ea7a6ee4396da357'))
paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', '79aaea078ddea57a82ed7906d71dedc7'))
paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'eeb1591cfc854c6ffdac77b376313c44'))
paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', 'fe352915a543cec434f74e9b32ac49da'))
......@@ -277,7 +277,7 @@ paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_di
paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '7637c974f2d749d359acae9062c4d96f'))
paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '169882eb87fb693198e0153629134c22'))
paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '26decdea9376b6b9a0d3432d82ca207b'))
paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b'))
paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '315b50c1cbd9569375b098c56f1e91c9'))
paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5b32ed21ab89140a8e758002923a0da3'))
paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
......@@ -416,13 +416,13 @@ paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type',
paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5'))
paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40'))
paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b'))
paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '651d98d51879dfa1bc1cd40391786a41'))
paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', 'd46629656b4ce9b07809e32c0482cbef'))
paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', 'd25e5e90f9a342764f32b5cd48657148'))
paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1'))
paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', '69def376b42ef0681d0cc7f53a2dac4b'))
paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1'))
paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'f2342042127b536a0a16390f149f1bba'))
paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '5cba014b41610431f8949e2d7336f1cc'))
paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef'))
paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '72fca4a39ccf82d5c746ae62d1868a99'))
paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '1d5144c3856673d05c29c752c7c8f821'))
......
......@@ -254,70 +254,66 @@ def rpn_target_assign(bbox_pred,
bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes. N is the batch size,
and each bounding box has four coordinate values and the layout
is [xmin, ymin, xmax, ymax].
is [xmin, ymin, xmax, ymax]. The data type can be float32 or float64.
cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the
predicted confidence predictions. N is the batch size, 1 is the
frontground and background sigmoid, M is number of bounding boxes.
The data type can be float32 or float64.
anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
each box is represented as [xmin, ymin, xmax, ymax],
[xmin, ymin] is the left top coordinate of the anchor box,
if the input is image feature map, they are close to the origin
of the coordinate system. [xmax, ymax] is the right bottom
coordinate of the anchor box.
coordinate of the anchor box. The data type can be float32 or float64.
anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded
variances of anchors.
variances of anchors. The data type can be float32 or float64.
gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D
LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
bboxes of mini-batch input.
bboxes of mini-batch input. The data type can be float32 or float64.
is_crowd (Variable): A 1-D LoDTensor which indicates groud-truth is crowd.
The data type must be int32.
im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
3 is the height, width and scale.
rpn_batch_size_per_im(int): Total number of RPN examples per image.
The data type must be int32.
rpn_straddle_thresh(float): Remove RPN anchors that go outside the image
by straddle_thresh pixels.
by straddle_thresh pixels. The data type must be float32.
rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled
foreground (i.e. class > 0), 0-th class is background.
foreground (i.e. class > 0), 0-th class is background. The data type must be float32.
rpn_positive_overlap(float): Minimum overlap required between an anchor
and ground-truth box for the (anchor, gt box) pair to be a positive
example.
example. The data type must be float32.
rpn_negative_overlap(float): Maximum overlap allowed between an anchor
and ground-truth box for the (anchor, gt box) pair to be a negative
examples.
examples. The data type must be float32.
Returns:
tuple:
A tuple(predicted_scores, predicted_location, target_label,
target_bbox, bbox_inside_weight) is returned. The predicted_scores
and predicted_location is the predicted result of the RPN.
The target_label and target_bbox is the ground truth,
respectively. The predicted_location is a 2D Tensor with shape
[F, 4], and the shape of target_bbox is same as the shape of
the predicted_location, F is the number of the foreground
anchors. The predicted_scores is a 2D Tensor with shape
[F + B, 1], and the shape of target_label is same as the shape
of the predicted_scores, B is the number of the background
anchors, the F and B is depends on the input of this operator.
Bbox_inside_weight represents whether the predicted loc is fake_fg
or not and the shape is [F, 4].
A tuple(predicted_scores, predicted_location, target_label,
target_bbox, bbox_inside_weight) is returned. The predicted_scores
and predicted_location is the predicted result of the RPN.
The target_label and target_bbox is the ground truth,
respectively. The predicted_location is a 2D Tensor with shape
[F, 4], and the shape of target_bbox is same as the shape of
the predicted_location, F is the number of the foreground
anchors. The predicted_scores is a 2D Tensor with shape
[F + B, 1], and the shape of target_label is same as the shape
of the predicted_scores, B is the number of the background
anchors, the F and B is depends on the input of this operator.
Bbox_inside_weight represents whether the predicted loc is fake_fg
or not and the shape is [F, 4].
Examples:
.. code-block:: python
import paddle.fluid as fluid
bbox_pred = fluid.layers.data(name='bbox_pred', shape=[100, 4],
append_batch_size=False, dtype='float32')
cls_logits = fluid.layers.data(name='cls_logits', shape=[100, 1],
append_batch_size=False, dtype='float32')
anchor_box = fluid.layers.data(name='anchor_box', shape=[20, 4],
append_batch_size=False, dtype='float32')
anchor_var = fluid.layers.data(name='anchor_var', shape=[20, 4],
append_batch_size=False, dtype='float32')
gt_boxes = fluid.layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32')
is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
append_batch_size=False, dtype='float32')
im_info = fluid.layers.data(name='im_infoss', shape=[1, 3],
append_batch_size=False, dtype='float32')
bbox_pred = fluid.data(name='bbox_pred', shape=[None, 4], dtype='float32')
cls_logits = fluid.data(name='cls_logits', shape=[None, 1], dtype='float32')
anchor_box = fluid.data(name='anchor_box', shape=[None, 4], dtype='float32')
anchor_var = fluid.data(name='anchor_var', shape=[None, 4], dtype='float32')
gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32')
is_crowd = fluid.data(name='is_crowd', shape=[None], dtype='float32')
im_info = fluid.data(name='im_infoss', shape=[None, 3], dtype='float32')
loc, score, loc_target, score_target, inside_weight = fluid.layers.rpn_target_assign(
bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes, is_crowd, im_info)
......@@ -2217,8 +2213,7 @@ def generate_proposal_labels(rpn_rois,
is_cls_agnostic=False,
is_cascade_rcnn=False):
"""
** Generate Proposal Labels of Faster-RCNN **
**Generate Proposal Labels of Faster-RCNN**
This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
to sample foreground boxes and background boxes, and compute loss target.
......@@ -2236,37 +2231,43 @@ def generate_proposal_labels(rpn_rois,
Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss.
Args:
rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth.
is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd.
rpn_rois(Variable): A 2-D LoDTensor with shape [N, 4]. N is the number of the GenerateProposalOp's output, each element is a bounding box with [xmin, ymin, xmax, ymax] format. The data type can be float32 or float64.
gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a class label of groundtruth. The data type must be int32.
is_crowd(Variable): A 2-D LoDTensor with shape [M, 1]. M is the number of groundtruth, each element is a flag indicates whether a groundtruth is crowd. The data type must be int32.
gt_boxes(Variable): A 2-D LoDTensor with shape [M, 4]. M is the number of groundtruth, each element is a bounding box with [xmin, ymin, xmax, ymax] format.
im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the number of input images, each element consists of im_height, im_width, im_scale.
batch_size_per_im(int): Batch size of rois per images.
fg_fraction(float): Foreground fraction in total batch_size_per_im.
fg_thresh(float): Overlap threshold which is used to chose foreground sample.
bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample.
bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample.
bbox_reg_weights(list|tuple): Box regression weights.
class_nums(int): Class number.
batch_size_per_im(int): Batch size of rois per images. The data type must be int32.
fg_fraction(float): Foreground fraction in total batch_size_per_im. The data type must be float32.
fg_thresh(float): Overlap threshold which is used to chose foreground sample. The data type must be float32.
bg_thresh_hi(float): Overlap threshold upper bound which is used to chose background sample. The data type must be float32.
bg_thresh_lo(float): Overlap threshold lower bound which is used to chose background sample. The data type must be float32.
bbox_reg_weights(list|tuple): Box regression weights. The data type must be float32.
class_nums(int): Class number. The data type must be int32.
use_random(bool): Use random sampling to choose foreground and background boxes.
is_cls_agnostic(bool): bbox regression use class agnostic simply which only represent fg and bg boxes.
is_cascade_rcnn(bool): it will filter some bbox crossing the image's boundary when setting True.
Returns:
tuple:
A tuple with format``(rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights)``.
- **rois**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4]``. The data type is the same as ``rpn_rois``.
- **labels_int32**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 1]``. The data type must be int32.
- **bbox_targets**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The regression targets of all RoIs. The data type is the same as ``rpn_rois``.
- **bbox_inside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The weights of foreground boxes' regression loss. The data type is the same as ``rpn_rois``.
- **bbox_outside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The weights of regression loss. The data type is the same as ``rpn_rois``.
Examples:
.. code-block:: python
import paddle.fluid as fluid
rpn_rois = fluid.layers.data(name='rpn_rois', shape=[2, 4],
append_batch_size=False, dtype='float32')
gt_classes = fluid.layers.data(name='gt_classes', shape=[8, 1],
append_batch_size=False, dtype='float32')
is_crowd = fluid.layers.data(name='is_crowd', shape=[8, 1],
append_batch_size=False, dtype='float32')
gt_boxes = fluid.layers.data(name='gt_boxes', shape=[8, 4],
append_batch_size=False, dtype='float32')
im_info = fluid.layers.data(name='im_info', shape=[10, 3],
append_batch_size=False, dtype='float32')
rpn_rois = fluid.data(name='rpn_rois', shape=[None, 4], dtype='float32')
gt_classes = fluid.data(name='gt_classes', shape=[None, 1], dtype='float32')
is_crowd = fluid.data(name='is_crowd', shape=[None, 1], dtype='float32')
gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32')
im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32')
rois, labels, bbox, inside_weights, outside_weights = fluid.layers.generate_proposal_labels(
rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
class_nums=10)
......@@ -2496,44 +2497,47 @@ def generate_proposals(scores,
scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents
the probability for each box to be an object.
N is batch size, A is number of anchors, H and W are height and
width of the feature map.
width of the feature map. The data type must be float32.
bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W]
represents the differece between predicted box locatoin and
anchor location.
anchor location. The data type must be float32.
im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin
image information for N batch. Info contains height, width and scale
between origin image size and the size of feature map.
The data type must be int32.
anchors(Variable): A 4-D Tensor represents the anchors with a layout
of [H, W, A, 4]. H and W are height and width of the feature map,
num_anchors is the box count of each position. Each anchor is
in (xmin, ymin, xmax, ymax) format an unnormalized.
variances(Variable): The expanded variances of anchors with a layout of
in (xmin, ymin, xmax, ymax) format an unnormalized. The data type must be float32.
variances(Variable): A 4-D Tensor. The expanded variances of anchors with a layout of
[H, W, num_priors, 4]. Each variance is in
(xcenter, ycenter, w, h) format.
(xcenter, ycenter, w, h) format. The data type must be float32.
pre_nms_top_n(float): Number of total bboxes to be kept per
image before NMS. 6000 by default.
image before NMS. The data type must be float32. `6000` by default.
post_nms_top_n(float): Number of total bboxes to be kept per
image after NMS. 1000 by default.
nms_thresh(float): Threshold in NMS, 0.5 by default.
image after NMS. The data type must be float32. `1000` by default.
nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default.
min_size(float): Remove predicted boxes with either height or
width < min_size. 0.1 by default.
eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5,
adaptive_threshold = adaptive_threshold * eta in each iteration.
width < min_size. The data type must be float32. `0.1` by default.
eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`,
`adaptive_threshold = adaptive_threshold * eta` in each iteration.
Returns:
tuple:
A tuple with format ``(rpn_rois, rpn_roi_probs)``.
- **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
- **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` while ``N`` is the number of RoIs. The data type is the same as ``scores``.
Examples:
.. code-block:: python
import paddle.fluid as fluid
scores = fluid.layers.data(name='scores', shape=[2, 4, 5, 5],
append_batch_size=False, dtype='float32')
bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[2, 16, 5, 5],
append_batch_size=False, dtype='float32')
im_info = fluid.layers.data(name='im_info', shape=[2, 3],
append_batch_size=False, dtype='float32')
anchors = fluid.layers.data(name='anchors', shape=[5, 5, 4, 4],
append_batch_size=False, dtype='float32')
variances = fluid.layers.data(name='variances', shape=[5, 5, 10, 4],
append_batch_size=False, dtype='float32')
scores = fluid.data(name='scores', shape=[None, 4, 5, 5], dtype='float32')
bbox_deltas = fluid.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32')
im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32')
anchors = fluid.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32')
variances = fluid.data(name='variances', shape=[None, 5, 10, 4], dtype='float32')
rois, roi_probs = fluid.layers.generate_proposals(scores, bbox_deltas,
im_info, anchors, variances)
......
......@@ -6761,7 +6761,7 @@ def warpctc(input,
(https://github.com/baidu-research/warp-ctc)
to compute Connectionist Temporal Classification (CTC) loss.
It can be aliased as softmax with CTC, since a native softmax activation is
interated to the Warp-CTC library, to to normlize values for each row of the
interated to the Warp-CTC library to normlize values for each row of the
input tensor.
Args:
......@@ -6773,14 +6773,15 @@ def warpctc(input,
(not including the blank label). When it is a 3-D Tensor, it's shape
is [max_logit_length, batch_size, num_classes + 1],
where max_logit_length is the length of the longest
input logit sequence.
input logit sequence. The data type must be float32.
label (Variable): The ground truth of variable-length sequence,
which is a 2-D Tensor with LoD information or a 2-D Tensor without
LoD information. When it is a 2-D LoDTensor or 2-D Tensor,
it is of the shape [Lg, 1], where Lg is th sum of all labels' length.
The data type must be int32.
blank (int, default 0): The blank label index of Connectionist
Temporal Classification (CTC) loss, which is in the
half-opened interval [0, num_classes + 1).
half-opened interval [0, num_classes + 1). The data type must be int32.
norm_by_times(bool, default false): Whether to normalize the gradients
by the number of time-step, which is also the sequence's length.
There is no need to normalize the gradients if warpctc layer was
......@@ -6792,40 +6793,72 @@ def warpctc(input,
Returns:
Variable: The Connectionist Temporal Classification (CTC) loss,
which is a 2-D Tensor of the shape [batch_size, 1].
which is a 2-D Tensor with the shape [batch_size, 1].
The date type is the same as input.
Examples:
.. code-block:: python
# using LoDTensor
import paddle.fluid as fluid
import numpy as np
label = fluid.layers.data(name='label', shape=[12, 1],
dtype='float32', lod_level=1)
predict = fluid.layers.data(name='predict',
shape=[11, 8],
predict = fluid.data(name='predict',
shape=[None, 5],
dtype='float32',lod_level=1)
label = fluid.data(name='label', shape=[None, 1],
dtype='int32', lod_level=1)
cost = fluid.layers.warpctc(input=predict, label=label)
place = fluid.CPUPlace()
x=fluid.LoDTensor()
data = np.random.rand(8, 5).astype("float32")
x.set(data, place)
x.set_lod([[0,4,8]])
y=fluid.LoDTensor()
data = np.random.randint(0, 5, [4, 1]).astype("int32")
y.set(data, place)
y.set_lod([[0,2,4]])
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output= exe.run(feed={"predict": x,"label": y},
fetch_list=[cost.name])
print output
.. code-block:: python
# using Tensor
input_length = fluid.layers.data(name='logits_length', shape=[11],
dtype='int64')
label_length = fluid.layers.data(name='labels_length', shape=[12],
dtype='int64')
target = fluid.layers.data(name='target', shape=[12, 1],
dtype='int32')
import paddle.fluid as fluid
import numpy as np
# length of the longest logit sequence
max_seq_length = 4
max_seq_length = 5
# number of logit sequences
batch_size = 4
output = fluid.layers.data(name='output',
shape=[max_seq_length, batch_size, 8],
batch_size = None
logits = fluid.data(name='logits',
shape=[max_seq_length, batch_size, 5],
dtype='float32')
loss = fluid.layers.warpctc(input=output,label=target,
input_length=input_length,
logits_length = fluid.data(name='logits_length', shape=[None],
dtype='int64')
label = fluid.layers.data(name='label', shape=[None, 1],
dtype='int32')
label_length = fluid.layers.data(name='labels_length', shape=[None],
dtype='int64')
cost = fluid.layers.warpctc(input=logits, label=label,
input_length=logits_length,
label_length=label_length)
place = fluid.CPUPlace()
batch_size = 2
x = np.random.rand(max_seq_length, batch_size, 5).astype("float32")
y = np.random.randint(0, 5, [max_seq_length * batch_size, 1]).astype("int32")
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output= exe.run(feed={"logits": x,
"label": y,
"logits_length": np.array([5, 4]).astype("int64"),
"labels_length": np.array([3, 2]).astype("int64")},
fetch_list=[cost.name])
print(output)
"""
helper = LayerHelper('warpctc', **locals())
this_inputs = {'Logits': [input], 'Label': [label]}
......@@ -10643,77 +10676,16 @@ def affine_grid(theta, out_shape, name=None):
the input feature map should be sampled to produce the transformed
output feature map.
.. code-block:: text
* Case 1:
Given:
theta = [[[x_11, x_12, x_13]
[x_14, x_15, x_16]]
[[x_21, x_22, x_23]
[x_24, x_25, x_26]]]
out_shape = [2, 3, 5, 5]
Step 1:
Generate normalized coordinates according to out_shape.
The values of the normalized coordinates are in the interval between -1 and 1.
The shape of the normalized coordinates is [2, H, W] as below:
C = [[[-1. -1. -1. -1. -1. ]
[-0.5 -0.5 -0.5 -0.5 -0.5]
[ 0. 0. 0. 0. 0. ]
[ 0.5 0.5 0.5 0.5 0.5]
[ 1. 1. 1. 1. 1. ]]
[[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
Step2:
Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get:
C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ]
[ 0. -1. 1. ]
[ 0.5 -1. 1. ]
[ 1. -1. 1. ]
[-1. -0.5 1. ]
[-0.5 -0.5 1. ]
[ 0. -0.5 1. ]
[ 0.5 -0.5 1. ]
[ 1. -0.5 1. ]
[-1. 0. 1. ]
[-0.5 0. 1. ]
[ 0. 0. 1. ]
[ 0.5 0. 1. ]
[ 1. 0. 1. ]
[-1. 0.5 1. ]
[-0.5 0.5 1. ]
[ 0. 0.5 1. ]
[ 0.5 0.5 1. ]
[ 1. 0.5 1. ]
[-1. 1. 1. ]
[-0.5 1. 1. ]
[ 0. 1. 1. ]
[ 0.5 1. 1. ]
[ 1. 1. 1. ]]
Step3:
Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
Args:
theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
``out_shape`` can be a Variable or a list or tuple.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Args:
theta (Variable) - A Tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters.
The data type can be float32 or float64.
out_shape (Variable | list | tuple): The shape of target output with format [batch_size, channel, height, width].
``out_shape`` can be a Tensor or a list or tuple. The data
type must be int32.
name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: The output with shape [N, H, W, 2].
Variable: A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`.
Raises:
ValueError: If the type of arguments is not supported.
......@@ -10723,13 +10695,20 @@ def affine_grid(theta, out_shape, name=None):
.. code-block:: python
import paddle.fluid as fluid
theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
data = fluid.layers.affine_grid(theta, out_shape)
# or
data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
import numpy as np
place = fluid.CPUPlace()
theta = fluid.data(name="x", shape=[None, 2, 3], dtype="float32")
out_shape = fluid.data(name="y", shape=[4], dtype="int32")
grid_0 = fluid.layers.affine_grid(theta, out_shape)
grid_1 = fluid.layers.affine_grid(theta, [5, 3, 28, 28])
batch_size=2
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output= exe.run(feed={"x": np.random.rand(batch_size,2,3).astype("float32"),
"y": np.array([5, 3, 28, 28]).astype("int32")},
fetch_list=[grid_0.name, grid_1.name])
print(output[0])
print(output[1])
"""
helper = LayerHelper('affine_grid')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册