fix adaptive_pool and yolov3_loss. test=develop

144016fc · dengkaipeng · eb65b4e4 · 144016fc · 144016fc · 144016fc
4 changed file
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -144,30 +144,36 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
                   "The ignore threshold to ignore confidence loss.")
        .SetDefault(0.7);
    AddComment(R"DOC(
-         This operator generate yolov3 loss by given predict result and ground
+         This operator generates yolov3 loss based on given predict result and ground
         truth boxes.
         
         The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, specify the grid size, each grid point predict given
-         number boxes, this given number is specified by anchors, it should be 
-         half anchors length, which following will be represented as S. In the 
-         second dimention(the channel dimention), C should be S * (class_num + 5),
-         class_num is the box categoriy number of source dataset(such as coco), 
-         so in the second dimention, stores 4 box location coordinates x, y, w, h 
-         and confidence score of the box and class one-hot key of each anchor box.
+         should be the same. H and W specify the grid size, and each grid point predicts
+         a given number of boxes. This number, which will be represented as S below,
+         is specified by the number of anchors. In the second dimension (the channel
+         dimension), C should be equal to S * (class_num + 5), where class_num is the
+         object category number of the source dataset (such as 80 in the coco dataset).
+         So the second (channel) dimension, apart from 4 box location coordinates x, y, w, h,
+         also includes the confidence score of the box and the class one-hot key of each anchor box.

-         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
-         correspnd to:
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
+         should be as follows:

         $$
-         b_x = \sigma(t_x) + c_x
-         b_y = \sigma(t_y) + c_y
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
         b_w = p_w e^{t_w}
+         $$
+         $$
         b_h = p_h e^{t_h}
         $$

-         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
-         is specified by anchors.
+         In the equations above, :math:`c_x, c_y` is the top-left corner of the current grid
+         and :math:`p_w, p_h` is specified by anchors.

         As for confidence score, it is the logistic regression value of IoU between
         anchor boxes and ground truth boxes, the score of the anchor box which has 

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -260,34 +260,39 @@ Example:
       $$

  For exclusive = false:
-
-  ..  math::
-
-       hstart &= i * strides[0] - paddings[0] \\
-       hend &= hstart + ksize[0] \\
-       wstart &= j * strides[1] - paddings[1] \\
-       wend &= wstart + ksize[1] \\
-       Output(i ,j) &= \frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
+       $$
+       hstart = i * strides[0] - paddings[0]
+       $$
+       $$
+       hend = hstart + ksize[0]
+       $$
+       $$
+       wstart = j * strides[1] - paddings[1]
+       $$
+       $$
+       wend = wstart + ksize[1]
+       $$
+       $$
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
+       $$

  For exclusive = true:
+       $$
+       hstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
+       hend = min(H, hstart + ksize[0])
+       $$
+       $$
+       wstart = max(0, j * strides[1] - paddings[1])
+       $$
+       $$
+       wend = min(W, wstart + ksize[1])
+       $$
+       $$
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+       $$

-  ..  math::
-
-       hstart &= max(0, i * strides[0] - paddings[0]) \\
-       hend &= min(H, hstart + ksize[0]) \\
-       wstart &= max(0, j * strides[1] - paddings[1]) \\
-       wend &= min(W, wstart + ksize[1]) \\
-       Output(i ,j) &= \frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-
-  For adaptive = true:
-
-  ..  math::
-
-       hstart &= floor(i * H_{in} / H_{out}) \\
-       hend &= ceil((i + 1) * H_{in} / H_{out}) \\
-       wstart &= floor(j * W_{in} / W_{out}) \\
-       wend &= ceil((j + 1) * W_{in} / W_{out}) \\
-       Output(i ,j) &= \frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
 )DOC");
 }

@@ -417,39 +422,50 @@ Example:
      $$

  For exclusive = false:
-
-  ..  math::
-
-      dstart &= i * strides[0] - paddings[0] \\
-      dend &= dstart + ksize[0] \\
-      hstart &= j * strides[1] - paddings[1] \\
-      hend &= hstart + ksize[1] \\
-      wstart &= k * strides[2] - paddings[2] \\
-      wend &= wstart + ksize[2] \\
-      Output(i ,j, k) &= \frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$
+       dstart = i * strides[0] - paddings[0]
+       $$
+       $$
+       dend = dstart + ksize[0]
+       $$
+       $$
+       hstart = j * strides[1] - paddings[1]
+       $$
+       $$
+       hend = hstart + ksize[1]
+       $$
+       $$
+       wstart = k * strides[2] - paddings[2]
+       $$
+       $$
+       wend = wstart + ksize[2]
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$

  For exclusive = true:
-
-  ..  math::
-
-      dstart &= max(0, i * strides[0] - paddings[0]) \\
-      dend &= min(D, dstart + ksize[0]) \\
-      hend &= min(H, hstart + ksize[1]) \\
-      wstart &= max(0, k * strides[2] - paddings[2]) \\
-      wend &= min(W, wstart + ksize[2]) \\
-      Output(i ,j, k) &= \frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-
-  For adaptive = true:
-
-  ..  math::
-
-      dstart &= floor(i * D_{in} / D_{out}) \\
-      dend &= ceil((i + 1) * D_{in} / D_{out}) \\
-      hstart &= floor(j * H_{in} / H_{out}) \\
-      hend &= ceil((j + 1) * H_{in} / H_{out}) \\
-      wstart &= floor(k * W_{in} / W_{out}) \\
-      wend &= ceil((k + 1) * W_{in} / W_{out}) \\
-      Output(i ,j, k) &= \frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$
+       dstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
+       dend = min(D, dstart + ksize[0])
+       $$
+       $$
+       hstart = max(0, j * strides[1] - paddings[1])
+       $$
+       $$
+       hend = min(H, hstart + ksize[1])
+       $$
+       $$
+       wstart = max(0, k * strides[2] - paddings[2])
+       $$
+       $$
+       wend = min(W, wstart + ksize[2])
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$

 )DOC");
 }

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -545,15 +545,16 @@ def yolov3_loss(x,
        TypeError: Attr ignore_thresh of yolov3_loss must be a float number

    Examples:
-    .. code-block:: python
-
-        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-        gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
-        gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
-        anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
-        anchors = [0, 1, 2]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, 
-                                        ignore_thresh=0.5, downsample_ratio=32)
+      .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+          gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
+          gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+          anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
+          anchor_mask = [0, 1, 2]
+          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, 
+                                          anchor_mask=anchor_mask, class_num=80,
+                                          ignore_thresh=0.7, downsample_ratio=32)
    """
    helper = LayerHelper('yolov3_loss', **locals())


--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2577,6 +2577,20 @@ def adaptive_pool2d(input,
    represent height and width, respectively. Also the H and W dimensions of output(Out)
    is same as Parameter(pool_size).

+    For average adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+
    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCHW, where N is batch size, C is
@@ -2675,6 +2689,24 @@ def adaptive_pool3d(input,
    three elements which represent height and width, respectively. Also the D, H and W
    dimensions of output(Out) is same as Parameter(pool_size).

+    For average adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+
    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCDHW, where N is batch size, C is