From b3d26cd3adb2a8979179a52b4765582bc23bc59f Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 12 Mar 2018 15:03:04 +0800
Subject: [PATCH] Fix bug in detection_output and mAP calculation in SSD.
 (#8985)

* Clip bboxes in the mAP evaluator calculation.

* Fix bug in detection_output and mAP calculation in SSD.

* Fix bug in detection.py.

* Fix bug in test_detection_map_op.py.
---
 paddle/fluid/operators/detection_map_op.h     | 10 ++--
 paddle/fluid/operators/prior_box_op.cc        |  3 +-
 paddle/fluid/operators/prior_box_op.h         | 45 ++++++----------
 python/paddle/fluid/layers/detection.py       | 51 ++++++++++---------
 .../tests/unittests/test_detection_map_op.py  |  2 -
 5 files changed, 50 insertions(+), 61 deletions(-)
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index a009e9dfc..8c15bfa36 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
                    std::map<int, std::vector<std::pair<T, int>>>& true_pos,
                    std::map<int, std::vector<std::pair<T, int>>>& false_pos,
                    const int class_num) const {
-    constexpr T kEPS = static_cast<T>(1e-6);
     const int* pos_count_data = input_pos_count.data<int>();
     for (int i = 0; i < class_num; ++i) {
       label_pos_count[i] = pos_count_data[i];
@@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto SetData = [](const framework::LoDTensor& pos_tensor,
                       std::map<int, std::vector<std::pair<T, int>>>& pos) {
       const T* pos_data = pos_tensor.data<T>();
-      auto pos_data_lod = pos_tensor.lod();
-      for (size_t i = 0; i < pos_data_lod.size(); ++i) {
-        for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) {
+      auto pos_data_lod = pos_tensor.lod()[0];
+      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
+        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
           T score = pos_data[j * 2];
-          int flag = 1;
-          if (pos_data[j * 2 + 1] < kEPS) flag = 0;
+          int flag = pos_data[j * 2 + 1];
           pos[i].push_back(std::make_pair(score, flag));
         }
       }
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index be7898c22..7ba55437c 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
         });
     AddAttr<std::vector<float>>(
         "max_sizes",
-        "(vector<float>) List of max sizes of generated prior boxes.");
+        "(vector<float>) List of max sizes of generated prior boxes.")
+        .SetDefault(std::vector<float>{});
     AddAttr<std::vector<float>>(
         "aspect_ratios",
         "(vector<float>) List of aspect ratios of generated prior boxes.");
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
index 0113d2f09..18bb2deb6 100644
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
-    T inv_img_width = 1.0 / img_width;
-    T inv_img_height = 1.0 / img_height;
-
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
@@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
         for (size_t s = 0; s < min_sizes.size(); ++s) {
           auto min_size = min_sizes[s];
           // first prior: aspect_ratio = 1, size = min_size
-          box_width = box_height = min_size;
+          box_width = box_height = min_size / 2.;
           // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
+          e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
           // ymin
-          e_boxes(h, w, idx, 1) =
-              (center_y - box_height * 0.5) * inv_img_height;
+          e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
           // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
+          e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
           // ymax
-          e_boxes(h, w, idx, 3) =
-              (center_y + box_height * 0.5) * inv_img_height;
+          e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
 
           idx++;
           if (max_sizes.size() > 0) {
             auto max_size = max_sizes[s];
             // second prior: aspect_ratio = 1,
             // size = sqrt(min_size * max_size)
-            box_width = box_height = sqrt(min_size * max_size);
+            box_width = box_height = sqrt(min_size * max_size) / 2.;
             // xmin
-            e_boxes(h, w, idx, 0) =
-                (center_x - box_width * 0.5) * inv_img_width;
+            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
             // ymin
-            e_boxes(h, w, idx, 1) =
-                (center_y - box_height * 0.5) * inv_img_height;
+            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
             // xmax
-            e_boxes(h, w, idx, 2) =
-                (center_x + box_width * 0.5) * inv_img_width;
+            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
             // ymax
-            e_boxes(h, w, idx, 3) =
-                (center_y + box_height * 0.5) * inv_img_height;
+            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
             idx++;
           }
 
@@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             if (fabs(ar - 1.) < 1e-6) {
               continue;
             }
-            box_width = min_size * sqrt(ar);
-            box_height = min_size / sqrt(ar);
+            box_width = min_size * sqrt(ar) / 2.;
+            box_height = min_size / sqrt(ar) / 2.;
             // xmin
-            e_boxes(h, w, idx, 0) =
-                (center_x - box_width * 0.5) * inv_img_width;
+            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
             // ymin
-            e_boxes(h, w, idx, 1) =
-                (center_y - box_height * 0.5) * inv_img_height;
+            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
             // xmax
-            e_boxes(h, w, idx, 2) =
-                (center_x + box_width * 0.5) * inv_img_width;
+            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
             // ymax
-            e_boxes(h, w, idx, 3) =
-                (center_y + box_height * 0.5) * inv_img_height;
+            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
             idx++;
           }
         }
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 2bf7cf21c..ea189749b 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -130,8 +130,13 @@ def detection_output(loc,
         target_box=loc,
         code_type='decode_center_size')
 
-    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    old_shape = scores.shape
+    scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
+    scores = ops.softmax(x=scores)
+    scores = ops.reshape(x=scores, shape=old_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
+
+    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -562,16 +567,16 @@ def multi_box_head(inputs,
                    base_size,
                    num_classes,
                    aspect_ratios,
-                   min_ratio,
-                   max_ratio,
+                   min_ratio=None,
+                   max_ratio=None,
                    min_sizes=None,
                    max_sizes=None,
                    steps=None,
                    step_w=None,
                    step_h=None,
                    offset=0.5,
-                   variance=[0.1, 0.1, 0.1, 0.1],
-                   flip=False,
+                   variance=[0.1, 0.1, 0.2, 0.2],
+                   flip=True,
                    clip=False,
                    kernel_size=1,
                    pad=0,
@@ -614,7 +619,7 @@ def multi_box_head(inputs,
             the inputs[i] will be automatically calculated. Default: None.
        offset(float): Prior boxes center offset. Default: 0.5
        variance(list|tuple): the variances to be encoded in prior boxes.
-            Default:[0.1, 0.1, 0.1, 0.1].
+            Default:[0.1, 0.1, 0.2, 0.2].
        flip(bool): Whether to flip aspect ratios. Default:False.
        clip(bool): Whether to clip out-of-boundary boxes. Default: False.
        kernel_size(int): The kernel size of conv2d. Default: 1.
@@ -668,6 +673,19 @@ def multi_box_head(inputs,
         helper = LayerHelper("prior_box", **locals())
         dtype = helper.input_dtype()
 
+        attrs = {
+            'min_sizes': min_sizes,
+            'aspect_ratios': aspect_ratios,
+            'variances': variance,
+            'flip': flip,
+            'clip': clip,
+            'step_w': step_w,
+            'step_h': step_h,
+            'offset': offset
+        }
+        if len(max_sizes) > 0 and max_sizes[0] > 0:
+            attrs['max_sizes'] = max_sizes
+
         box = helper.create_tmp_variable(dtype)
         var = helper.create_tmp_variable(dtype)
         helper.append_op(
@@ -676,17 +694,7 @@ def multi_box_head(inputs,
                     "Image": image},
             outputs={"Boxes": box,
                      "Variances": var},
-            attrs={
-                'min_sizes': min_sizes,
-                'max_sizes': max_sizes,
-                'aspect_ratios': aspect_ratios,
-                'variances': variance,
-                'flip': flip,
-                'clip': clip,
-                'step_w': step_w,
-                'step_h': step_h,
-                'offset': offset
-            })
+            attrs=attrs, )
         return box, var
 
     def _reshape_with_axis_(input, axis=1):
@@ -714,7 +722,7 @@ def multi_box_head(inputs,
     if num_layer <= 2:
         assert min_sizes is not None and max_sizes is not None
         assert len(min_sizes) == num_layer and len(max_sizes) == num_layer
-    else:
+    elif min_sizes is None and max_sizes is None:
         min_sizes = []
         max_sizes = []
         step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
@@ -759,9 +767,6 @@ def multi_box_head(inputs,
             min_size = [min_size]
         if not _is_list_or_tuple_(max_size):
             max_size = [max_size]
-        if not (len(max_size) == len(min_size)):
-            raise ValueError(
-                'the length of max_size and min_size should be equal.')
 
         aspect_ratio = []
         if aspect_ratios is not None:
@@ -779,7 +784,7 @@ def multi_box_head(inputs,
 
         num_boxes = box.shape[2]
 
-        # get box_loc
+        # get loc
         num_loc_output = num_boxes * 4
         mbox_loc = nn.conv2d(
             input=input,
@@ -796,7 +801,7 @@ def multi_box_head(inputs,
         mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
         mbox_locs.append(mbox_loc_flatten)
 
-        # get conf_loc
+        # get conf
         num_conf_output = num_boxes * num_classes
         conf_loc = nn.conv2d(
             input=input,
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index f3197a623..a905a854a 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -166,8 +166,6 @@ class TestDetectionMAPOp(OpTest):
             elif not difficult:
                 label_count[label] += 1
 
-        true_pos = collections.defaultdict(list)
-        false_pos = collections.defaultdict(list)
         for (label, score, tp, fp) in tf_pos:
             true_pos[label].append([score, tp])
             false_pos[label].append([score, fp])
-- 
GitLab