refine deformable roi pooling doc (#19944)

* refine doc, test=develop, test=document_preview

refine deformable roi pooling doc (#19944)
* refine doc, test=develop, test=document_preview
0099e549 · chengjuntao · GitHub · b1bb2384 · 0099e549 · 0099e549
隐藏空白更改
内联并排

Showing with 75 addition and 33 deletion

paddle/fluid/API.spec paddle/fluid/API.spec +1 -1

python/paddle/fluid/layers/nn.py python/paddle/fluid/layers/nn.py +74 -32

未找到文件。
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -294,7 +294,7 @@ paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=No
 paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'fa2f457a81714430c5677c2d68744728'))
 paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'modulated', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, True, None)), ('document', '335193ac57d41d7199f8d26d30c069b1'))
 paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
-paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '99c03e3f249e36854f87dedaa17c8f35'))
+paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '47c5d1c890b36fa00ff3285c9398f613'))
 paddle.fluid.layers.filter_by_instag (ArgSpec(args=['ins', 'ins_tag', 'filter_tag', 'is_lod'], varargs=None, keywords=None, defaults=None), ('document', '7703a2088af8de4128b143ff1164ca4a'))
 paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'c4969dd6bf164f9e6a90414ea4f4e5ad'))
 paddle.fluid.layers.hard_swish (ArgSpec(args=['x', 'threshold', 'scale', 'offset', 'name'], varargs=None, keywords=None, defaults=(6.0, 6.0, 3.0, None)), ('document', '6a5152a7015c62cb8278fc24cb456459'))

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -14197,43 +14197,85 @@ def deformable_roi_pooling(input,
                           position_sensitive=False,
                           name=None):
    """
-    Deformable PSROI Pooling Layer
+    Deformable ROI Pooling Layer
+    Performs deformable region-of-interest pooling on inputs. As described
+    in `Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_, it obtains an offset for each bin after
+    roi pooling so that pooling is performed at the correct region. After deformable_roi_pooling, the batch size becomes the number of region bounding boxes.
+    The operation has three steps:
-    Args:
+    1. Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height.
-       input (Variable):The input of Deformable PSROIPooling.The shape of input tensor is 
-                        [N,C,H,W]. Where N is batch size,C is number of input channels,H 
+    2. Add the offset to each pixel in the ROI to get its new location; the new value is computed directly through
-                        is height of the feature, and W is the width of the feature.
+       bilinear interpolation with the four nearest pixels.
-       rois (Variable): ROIs (Regions of Interest) to pool over.It should be
-                        a 2-D LoDTensor of shape (num_rois, 4), the lod level
+    3. Sample several points in each bin to get average values as output.
-                        is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                        the top left coordinates, and (x2, y2) is the bottom
-                        right coordinates.
+    Args:
-       trans (Variable): Offset of features on ROIs while pooling.The format is NCHW, where 
+        input (Variable):The input of deformable roi pooling and it is tensor which value type is float32. The shape of input is
-                         N is number of ROIs, C is number of channels, which indicate the offset distance 
+                         [N, C, H, W]. Where N is batch size, C is number of input channels,
-                         in the x and y directions, H is pooled height, and W is pooled width.
+                         H is height of the feature, and W is the width of the feature.
-       no_trans (bool): Whether to add offset to get new value or not while roi pooling, which 
+        rois (Variable): ROIs (Regions of Interest) with type float32 to pool over. It should be
-                          value is True or False. Default: False.
+                         a 2-D LoDTensor of shape (num_rois, 4), and the lod level
-       spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width).
+                         is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
-                             Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
+                         the top left coordinates, and (x2, y2) is the bottom
-       group_size (list|tuple): The number of groups which input channels are divided.(eg.number of input channels 
+                         right coordinates, which value type is float32.
-                         is k1*k2*(C+1), which k1 and k2 are group width and height and C+1 is number of output
+        trans (Variable): Offset of features on ROIs while pooling which value type is float32. The format is [N, C, H, W], where 
-                         chanels. eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1].
+                          N is number of ROIs, C is number of channels, which indicate the offset distance 
-       pooled_height (integer): The pooled output height. Default: 1.
+                          in the x and y directions, H is pooled height, and W is pooled width. 
-       pooled_width (integer): The pooled output width. Default: 1.
+        no_trans (bool): Whether to skip adding the offset when computing new values during roi pooling. The value is a bool (True or False).
-       part_size (list|tuple): The height and width of offset, eg.(4, 6), which height is 4 and width is 6, Default: 
+                         If value is True, no offset will be added in operation. Default: False.
-                        if None, default value is [pooled_height, pooled_width].
+        spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width), which value type is float32.
-       sample_per_part (integer): The number of samples in each bin. Default: 1.
+                         Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
-       trans_std (float): Coefficient of offset. Default: 0.1.
+        group_size (list|tuple): The number of groups which input channels are divided and the input is list or tuple, which value type is int32. (eg.number of input channels 
-       position_sensitive (bool): Whether to choose deformable psroi pooling mode or not. Default: False.
+                          is k1 * k2 * (C + 1), which k1 and k2 are group width and height and C+1 is number of output
-       name (str): Name of layer. Default: None.
+                          channels.) e.g. (4, 6), where 4 is the height of the group and 6 is the width of the group. Default: [1, 1].
-    Returns:
+        pooled_height (int): The pooled output height which value type is int32. Default: 1.
-        Variable: The tensor variable storing the deformable psroi pooling \
+        pooled_width (int): The pooled output width which value type is int32. Default: 1.
-                  result.
+        part_size (list|tuple): The height and width of offset which values in list or tuple is int32, eg.(4, 6), which height is 4 and width is 6, and values always equal to pooled_height \
+                         and pooled_width. Default: if None, default value is [pooled_height, pooled_width].
+        sample_per_part (int): The number of samples in each bin, with value type int32. A larger value costs more computation. Default: 1.
+        trans_std (float): Coefficient of offset which value type is float32. It controls weight of offset. Default: 0.1.
+        position_sensitive (bool): Whether to choose deformable psroi pooling mode or not, and value type is bool (True or False). If value is False, input dimension equals output dimension. \
+                                   If value is True, input dimension should be output dimension * pooled_height * pooled_width. Default: False.
+        name (str|None): Name of layer. Default: None.
+    Returns:
+        Variable: Output of deformable roi pooling. If position_sensitive is False, the output dimension equals the input dimension. If position_sensitive is True,\
+                  the output dimension is the input dimension divided by (pooled_height * pooled_width).
    Examples:
      .. code-block:: python
+        # position_sensitive=True
+        import paddle.fluid as fluid
+        input = fluid.layers.data(name="input",
+                                  shape=[2, 192, 64, 64], 
+                                  dtype='float32', 
+                                  append_batch_size=False)                   
+        rois = fluid.layers.data(name="rois",
+                                 shape=[4],
+                                 dtype='float32', 
+                                 lod_level=1)
+        trans = fluid.layers.data(name="trans",
+                                  shape=[2, 384, 64, 64], 
+                                  dtype='float32', 
+                                  append_batch_size=False) 
+        x = fluid.layers.nn.deformable_roi_pooling(input=input, 
+                                                     rois=rois, 
+                                                     trans=trans, 
+                                                     no_trans=False,
+                                                     spatial_scale=1.0, 
+                                                     group_size=(1, 1),
+                                                     pooled_height=8,
+                                                     pooled_width=8,
+                                                     part_size=(8, 8),
+                                                     sample_per_part=4, 
+                                                     trans_std=0.1,
+                                                     position_sensitive=True)
+        # position_sensitive=False
        import paddle.fluid as fluid
        input = fluid.layers.data(name="input",
                                  shape=[2, 192, 64, 64],