Enhance detection_map_op and more check in prior_box API. (#10796)

1. If none of the ground-truth bboxes are difficult, users do not need to define a data layer for the difficult flag; the corresponding input can be None for the detection_map API. 2. Set a default value for aspect_ratios in the prior_box API. 3. Add more checks in the prior_box API.

Enhance detection_map_op and more check in prior_box API. (#10796)
1. If none of the ground-truth bboxes are difficult, users do not need to define a data layer for the difficult flag; the corresponding input can be None for the detection_map API. 2. Set a default value for aspect_ratios in the prior_box API. 3. Add more checks in the prior_box API.
2a77fc50 · qingqing01 · GitHub · be26b71b · 2a77fc50 · 2a77fc50
5 changed file
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -51,7 +51,8 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                      "The rank of Input(Label) must be 2, "
                      "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(label_dims[1], 6, "The shape is of Input(Label) [N, 6].");
+    PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
+                   "The shape of Input(Label) is [N, 6] or [N, 5].");

    if (ctx->HasInput("PosCount")) {
      PADDLE_ENFORCE(ctx->HasInput("TruePos"),
@@ -88,9 +89,10 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
             "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
             "no detected data.");
    AddInput("Label",
-             "(LoDTensor) A 2-D LoDTensor with shape[N, 6] represents the"
+             "(LoDTensor) A 2-D LoDTensor represents the"
             "Labeled ground-truth data. Each row has 6 values: "
-             "[label, is_difficult, xmin, ymin, xmax, ymax], N is the total "
+             "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: "
+             "[label, xmin, ymin, xmax, ymax], where N is the total "
             "number of ground-truth data in this mini-batch. For each "
             "instance, the offsets in first dimension are called LoD, "
             "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "

--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -72,7 +72,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");

    float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
+    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
    int class_num = ctx.Attr<int>("class_num");

@@ -175,14 +175,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    for (int n = 0; n < batch_size; ++n) {
      std::map<int, std::vector<Box>> boxes;
      for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) {
-        Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
        int label = labels(i, 0);
-        auto is_difficult = labels(i, 1);
-        if (std::abs(is_difficult - 0.0) < 1e-6)
-          box.is_difficult = false;
-        else
-          box.is_difficult = true;
-        boxes[label].push_back(box);
+        if (input_label.dims()[1] == 6) {
+          Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
+          auto is_difficult = labels(i, 1);
+          if (std::abs(is_difficult - 0.0) < 1e-6)
+            box.is_difficult = false;
+          else
+            box.is_difficult = true;
+          boxes[label].push_back(box);
+        } else {
+          PADDLE_ENFORCE_EQ(input_label.dims()[1], 5);
+          Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4));
+          boxes[label].push_back(box);
+        }
      }
      gt_boxes->push_back(boxes);
    }

--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -273,10 +273,11 @@ class DetectionMAP(Evaluator):
            [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
        gt_label (Variable): The ground truth label index, which is a LoDTensor
            with shape [N, 1].
-        gt_difficult (Variable): Whether this ground truth is a difficult
-            bounding box (bbox), which is a LoDTensor [N, 1].
        gt_box (Variable): The ground truth bounding box (bbox), which is a
            LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
        class_num (int): The class number.
        background_label (int): The index of background label, the background
            label will be ignored. If set to -1, then all categories will be
@@ -284,7 +285,8 @@ class DetectionMAP(Evaluator):
        overlap_threshold (float): The threshold for deciding true/false
            positive, 0.5 by defalut.
        evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by defalut.
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
        ap_version (string): The average precision calculation ways, it must be
            'integral' or '11point'. Please check
            https://sanchom.wordpress.com/tag/average-precision/ for details.
@@ -295,7 +297,7 @@ class DetectionMAP(Evaluator):

        exe = fluid.executor(place)
        map_evaluator = fluid.Evaluator.DetectionMAP(input,
-            gt_label, gt_difficult, gt_box)
+            gt_label, gt_box, gt_difficult)
        cur_map, accum_map = map_evaluator.get_map_var()
        fetch = [cost, cur_map, accum_map]
        for epoch in PASS_NUM:
@@ -313,8 +315,8 @@ class DetectionMAP(Evaluator):
                 input,
                 gt_label,
                 gt_box,
-                 gt_difficult,
-                 class_num,
+                 gt_difficult=None,
+                 class_num=None,
                 background_label=0,
                 overlap_threshold=0.5,
                 evaluate_difficult=True,
@@ -322,8 +324,11 @@ class DetectionMAP(Evaluator):
        super(DetectionMAP, self).__init__("map_eval")

        gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-        label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)

        # calculate mean average precision (mAP) of current mini-batch
        map = layers.detection_map(

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -569,7 +569,7 @@ def prior_box(input,
              image,
              min_sizes,
              max_sizes=None,
-              aspect_ratios=None,
+              aspect_ratios=[1.],
              variance=[0.1, 0.1, 0.2, 0.2],
              flip=False,
              clip=False,
@@ -589,19 +589,19 @@ def prior_box(input,
       input(Variable): The Input Variables, the format is NCHW.
       image(Variable): The input image data of PriorBoxOp,
            the layout is NCHW.
-       min_sizes(list|tuple): min sizes of generated prior boxes.
+       min_sizes(list|tuple|float value): min sizes of generated prior boxes.
       max_sizes(list|tuple|None): max sizes of generated prior boxes.
            Default: None.
-       aspect_ratios(list|tuple): the aspect ratios of generated prior
-            boxes. Default: None.
+       aspect_ratios(list|tuple|float value): the aspect ratios of generated
+            prior boxes. Default: [1.].
       variance(list|tuple): the variances to be encoded in prior boxes.
            Default:[0.1, 0.1, 0.2, 0.2].
       flip(bool): Whether to flip aspect ratios. Default:False.
       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
-       step(list|turple): Prior boxes step across weight and height, If
+       step(list|turple): Prior boxes step across width and height, If
            step[0] == 0.0/step[1] == 0.0, the prior boxes step across
-            height/weight  of the input will be automatically calculated.
-            Default: [0.0]
+            height/weight of the input will be automatically calculated.
+            Default: [0., 0.]
       offset(float): Prior boxes center offset. Default: 0.5
       name(str): Name of the prior box op. Default: None.

@@ -630,6 +630,21 @@ def prior_box(input,
    helper = LayerHelper("prior_box", **locals())
    dtype = helper.input_dtype()

+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        raise ValueError('steps should be a list or tuple ',
+                         'with length 2, (step_width, step_height).')
+
+    min_sizes = list(map(float, min_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    steps = list(map(float, steps))
+
    attrs = {
        'min_sizes': min_sizes,
        'aspect_ratios': aspect_ratios,
@@ -641,6 +656,8 @@ def prior_box(input,
        'offset': offset
    }
    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
        attrs['max_sizes'] = max_sizes

    box = helper.create_tmp_variable(dtype)

--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -160,7 +160,9 @@ class TestDetectionMAPOp(OpTest):
        label_count, true_pos, false_pos = get_input_pos(
            self.class_pos_count, self.true_pos, self.true_pos_lod,
            self.false_pos, self.false_pos_lod)
-        for (label, difficult, xmin, ymin, xmax, ymax) in self.label:
+        for v in self.label:
+            label = v[0]
+            difficult = False if len(v) == 5 else v[1]
            if self.evaluate_difficult:
                label_count[label] += 1
            elif not difficult:
@@ -245,6 +247,15 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
                       [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]


+class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
+
+        # label xmin ymin xmax ymax
+        self.label = [[1, 0.1, 0.1, 0.3, 0.3], [1, 0.6, 0.6, 0.8, 0.8],
+                      [2, 0.3, 0.3, 0.6, 0.5], [1, 0.7, 0.1, 0.9, 0.3]]
+
+
 class TestDetectionMAPOp11Point(TestDetectionMAPOp):
    def init_test_case(self):
        super(TestDetectionMAPOp11Point, self).init_test_case()