From 057efd1709db21744d672e3b1db74da561bd77ae Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 12 Feb 2018 16:21:44 +0800
Subject: [PATCH] Implement multibox loss wrapper for SSD in Python API.
 (#8385)

* Implement multibox loss wrapper in Python API.

* Add some wrappers for SSD detection.

* Fix conflicts.

* Add unit testing for SSD loss wrapper.

* Update doc in Python API.

* Refine unit testing.

* Add more unit testing and update some interface arguments.
---
 .../fluid/operators/mine_hard_examples_op.cc  |   2 +
 python/paddle/v2/fluid/layers/__init__.py     |   3 -
 python/paddle/v2/fluid/layers/detection.py    | 374 +++++++++++++++++-
 .../paddle/v2/fluid/tests/test_detection.py   |  81 +++-
 4 files changed, 424 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
index 73a6c0b67..540cf8674 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -237,6 +237,8 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
     }
 
     ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+    // The first dimension of NegIndices will be set correctly in Compute.
+    ctx->SetOutputDim("NegIndices", {-1, 1});
   }
 
  protected:
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
index cfbbf710b..f4fb2ca27 100644
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -16,8 +16,6 @@ import ops
 from ops import *
 import nn
 from nn import *
-import detection
-from detection import *
 import io
 from io import *
 import tensor
@@ -33,7 +31,6 @@ from detection import *
 
 __all__ = []
 __all__ += math_op_patch.__all__
-__all__ += detection.__all__
 __all__ += nn.__all__
 __all__ += io.__all__
 __all__ += tensor.__all__
diff --git a/python/paddle/v2/fluid/layers/detection.py b/python/paddle/v2/fluid/layers/detection.py
index 0f3256d76..659ebd5f7 100644
--- a/python/paddle/v2/fluid/layers/detection.py
+++ b/python/paddle/v2/fluid/layers/detection.py
@@ -1,10 +1,10 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,17 +15,31 @@
 All layers just related to the detection neural network.
 """
 
+from layer_function_generator import generate_layer_fn
 from ..layer_helper import LayerHelper
-from ..framework import Variable
-from tensor import concat
-from ops import reshape
+import nn
+import ops
+import tensor
 import math
 
 __all__ = [
-    'detection_output',
     'prior_box',
+    'bipartite_match',
+    'target_assign',
+    'detection_output',
+    'ssd_loss',
 ]
 
+__auto__ = [
+    'iou_similarity',
+    'box_coder',
+]
+
+__all__ += __auto__
+
+for _OP in set(__auto__):
+    globals()[_OP] = generate_layer_fn(_OP)
+
 
 def detection_output(scores,
                      loc,
@@ -95,18 +109,13 @@ def detection_output(scores,
     """
 
     helper = LayerHelper("detection_output", **locals())
-    decoded_box = helper.create_tmp_variable(dtype=loc.dtype)
-    helper.append_op(
-        type="box_coder",
-        inputs={
-            'PriorBox': prior_box,
-            'PriorBoxVar': prior_box_var,
-            'TargetBox': loc
-        },
-        outputs={'OutputBox': decoded_box},
-        attrs={'code_type': 'decode_center_size'})
-    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    decoded_box = box_coder(
+        prior_box=prior_box,
+        prior_box_var=prior_box_var,
+        target_box=loc,
+        code_type='decode_center_size')
 
+    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -246,7 +255,7 @@ def prior_box(inputs,
         new_shape = [
             -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
         ]
-        out = reshape(x=input, shape=new_shape)
+        out = ops.reshape(x=input, shape=new_shape)
         return out
 
     assert isinstance(inputs, list), 'inputs should be a list.'
@@ -322,7 +331,332 @@ def prior_box(inputs,
             reshaped_boxes.append(_reshape_with_axis_(box_results[i], axis=3))
             reshaped_vars.append(_reshape_with_axis_(var_results[i], axis=3))
 
-        box = concat(reshaped_boxes)
-        var = concat(reshaped_vars)
+        box = tensor.concat(reshaped_boxes)
+        var = tensor.concat(reshaped_vars)
 
     return box, var
+
+
+def bipartite_match(dist_matrix, name=None):
+    """
+    **Bipartite matching operator**
+
+    This operator is a greedy bipartite matching algorithm, which is used to
+    obtain the matching with the maximum distance based on the input
+    distance matrix. For input 2D matrix, the bipartite matching algorithm can
+    find the matched column for each row, also can find the matched row for
+    each column. And this operator only calculates matched indices from column
+    to row. For each instance, the number of matched indices is the number of
+    columns of the input distance matrix.
+
+    There are two outputs to save matched indices and distance.
+    A simple description, this algorithm matches the best (maximum distance)
+    row entity to the column entity and the matched indices are not duplicated
+    in each row of ColToRowMatchIndices. If the column entity is not matched
+    to any row entity, set -1 in ColToRowMatchIndices.
+
+    Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+    If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
+    If Tensor, the height of ColToRowMatchIndices is 1.
+
+    Args:
+        dist_matrix(Variable): This input is a 2-D LoDTensor with shape
+            [K, M]. It is pair-wise distance matrix between the entities
+            represented by each row and each column. For example, assumed one
+            entity is A with shape [K], another entity is B with shape [M]. The
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
+            the distance is, the better matching the pairs are. Please note,
+            this tensor can contain LoD information to represent a batch of
+            inputs. One instance of this batch can contain different numbers of
+            entities.
+    Returns:
+        match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
+            N is the batch size. If match_indices[i][j] is -1, it
+            means B[j] does not match any entity in i-th instance.
+            Otherwise, it means B[j] is matched to row
+            match_indices[i][j] in i-th instance. The row number of
+            i-th instance is saved in match_indices[i][j].
+        match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
+            N is batch size. If match_indices[i][j] is -1,
+            match_distance[i][j] is also -1.0. Otherwise, assumed
+            match_indices[i][j] = d, and the row offsets of each instance
+            are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j].
+    """
+    helper = LayerHelper('bipartite_match', **locals())
+    match_indices = helper.create_tmp_variable(dtype='int32')
+    match_distance = helper.create_tmp_variable(dtype=dist_matrix.dtype)
+    helper.append_op(
+        type='bipartite_match',
+        inputs={'DistMat': dist_matrix},
+        outputs={
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_distance
+        })
+    return match_indices, match_distance
+
+
+def target_assign(input,
+                  matched_indices,
+                  negative_indices=None,
+                  mismatch_value=None,
+                  name=None):
+    """
+    **Target assigner operator**
+
+    Given the target bounding boxes or labels, this operator assigns
+    classification and regression targets to each prediction, as well as
+    weights to each prediction. The weights specify which predictions do
+    not contribute to the training loss.
+
+    For each instance, the output `out` and `out_weight` are assigned based on
+    `matched_indices` and `negative_indices`.
+    Assumed that the row offset for each instance in `input` is called lod,
+    this operator assigns classification/regression targets by performing the
+    following steps:
+
+    1. Assigning all outputs based on `matched_indices`:
+
+    If id = matched_indices[i][j] > 0,
+
+        out[i][j][0 : K] = input[lod[i] + id][j % P][0 : K]
+        out_weight[i][j] = 1.
+
+    Otherwise,
+
+        out[i][j][0 : K] = {mismatch_value, mismatch_value, ...}
+        out_weight[i][j] = 0.
+
+    2. Assigning out_weight based on `negative_indices`, if it is provided:
+
+    Assuming the row offset of each instance in `negative_indices` is called
+    neg_lod, for the i-th instance and each `id` of `negative_indices` in it:
+
+        out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
+        out_weight[i][id] = 1.0
+
+    Args:
+       input (Variable): This input is a 3D LoDTensor with shape [M, P, K].
+       matched_indices (Variable): The input matched indices are a 2D
+           Tensor<int32> with shape [N, P]. If matched_indices[i][j] is -1,
+           the j-th entity of column is not matched to any entity of row in
+           i-th instance.
+       negative_indices (Variable): The input negative example indices are
+           an optional input with shape [Neg, 1] and int32 type, where Neg is
+           the total number of negative example indices.
+       mismatch_value (float32): Fill this value to the mismatched location.
+
+    Returns:
+       out (Variable): The output is a 3D Tensor with shape [N, P, K],
+           N and P are the same as they are in `matched_indices`, K is the
+           same as it is in `input`.
+       out_weight (Variable): The weight for output with the shape of [N, P, 1].
+    """
+    helper = LayerHelper('target_assign', **locals())
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    out_weight = helper.create_tmp_variable(dtype='float32')
+    helper.append_op(
+        type='target_assign',
+        inputs={
+            'X': input,
+            'MatchIndices': matched_indices,
+            'NegIndices': negative_indices
+        },
+        outputs={'Out': out,
+                 'OutWeight': out_weight},
+        attrs={'mismatch_value': mismatch_value})
+    return out, out_weight
+
+
+def ssd_loss(location,
+             confidence,
+             gt_box,
+             gt_label,
+             prior_box,
+             prior_box_var=None,
+             background_label=0,
+             overlap_threshold=0.5,
+             neg_pos_ratio=3.0,
+             neg_overlap=0.5,
+             loc_loss_weight=1.0,
+             conf_loss_weight=1.0,
+             match_type='per_prediction',
+             mining_type='max_negative',
+             sample_size=None):
+    """
+    **Multi-box loss layer for object detection algorithm of SSD**
+
+    This layer is to compute detection loss for SSD given the location offset
+    predictions, confidence predictions, prior boxes and ground-truth bounding
+    boxes and labels, and the type of hard example mining. The returned loss
+    is a weighted sum of the localization loss (or regression loss) and
+    confidence loss (or classification loss) by performing the following steps:
+
+    1. Find matched bounding box by bipartite matching algorithm.
+      1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+      1.2 Compute matched bounding box by bipartite matching algorithm.
+    2. Compute confidence for mining hard examples
+      2.1. Get the target label based on matched indices.
+      2.2. Compute confidence loss.
+    3. Apply hard example mining to get the negative example indices and update
+       the matched indices.
+    4. Assign classification and regression targets
+      4.1. Encoded bbox according to the prior boxes.
+      4.2. Assign regression targets.
+      4.3. Assign classification targets.
+    5. Compute the overall objective loss.
+      5.1 Compute confidence loss.
+      5.2 Compute localization loss.
+      5.3 Compute the overall weighted loss.
+
+    Args:
+        location (Variable): The location predictions are a 3D Tensor with
+            shape [N, Np, 4], N is the batch size, Np is total number of
+            predictions for each instance. 4 is the number of coordinate values,
+            the layout is [xmin, ymin, xmax, ymax].
+        confidence (Variable): The confidence predictions are a 3D Tensor
+            with shape [N, Np, C], N and Np are the same as they are in
+            `location`, C is the class number.
+        gt_box (Variable): The ground-truth bounding boxes (bboxes) are a 2D
+            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
+            bboxes of mini-batch input.
+        gt_label (Variable): The ground-truth labels are a 2D LoDTensor
+            with shape [Ng, 1].
+        prior_box (Variable): The prior boxes are a 2D Tensor with shape [Np, 4].
+        prior_box_var (Variable): The variance of prior boxes are a 2D Tensor
+            with shape [Np, 4].
+        background_label (int): The index of background label, 0 by default.
+        overlap_threshold (float): If match_type is 'per_prediction', use
+            `overlap_threshold` to determine the extra matching bboxes when
+            finding matched boxes. 0.5 by default.
+        neg_pos_ratio (float): The ratio of the negative boxes to the positive
+            boxes, used only when mining_type is max_negative, 3.0 by default.
+        neg_overlap (float): The negative overlap upper bound for the unmatched
+            predictions. Use only when mining_type is max_negative,
+            0.5 by default.
+        sample_size (int): The max sample size of negative box, used only when
+            mining_type is hard_example.
+        loc_loss_weight (float): Weight for localization loss, 1.0 by default.
+        conf_loss_weight (float): Weight for confidence loss, 1.0 by default.
+        match_type (str): The type of matching method during training, should
+            be 'bipartite' or 'per_prediction'.
+        mining_type (str): The hard example mining type, should be 'hard_example'
+            or 'max_negative', now only support `max_negative`.
+
+    Returns:
+        Variable: The weighted sum of the localization loss and confidence loss,
+            with shape [N * Np, 1], N and Np are the same as they are
+            in `location`.
+
+    Raises:
+        ValueError: If mining_type is 'hard_example', now only support
+            mining type of `max_negative`.
+
+    Examples:
+        .. code-block:: python
+
+            pb = layers.data(
+                name='prior_box',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            pbv = layers.data(
+                name='prior_box_var',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+            gt_box = layers.data(
+                name='gt_box', shape=[4], lod_level=1, dtype='float32')
+            gt_label = layers.data(
+                name='gt_label', shape=[1], lod_level=1, dtype='float32')
+            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+    """
+
+    helper = LayerHelper('ssd_loss', **locals())
+    if mining_type != 'max_negative':
+        raise ValueError("Only support mining_type == max_negative now.")
+
+    num, num_prior, num_class = confidence.shape
+
+    def __reshape_to_2d(var):
+        return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+
+    # 1. Find matched bounding box by prior box.
+    #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+    iou = iou_similarity(x=gt_box, y=prior_box)
+    #   1.2 Compute matched bounding box by bipartite matching algorithm.
+    matched_indices, matched_dist = bipartite_match(iou)
+
+    # 2. Compute confidence for mining hard examples
+    # 2.1. Get the target label based on matched indices
+    gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    target_label, _ = target_assign(
+        gt_label, matched_indices, mismatch_value=background_label)
+    # 2.2. Compute confidence loss.
+    # Reshape confidence to 2D tensor.
+    confidence = __reshape_to_2d(confidence)
+    target_label = tensor.cast(x=target_label, dtype='int64')
+    target_label = __reshape_to_2d(target_label)
+    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
+
+    # 3. Mining hard examples
+    conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    neg_indices = helper.create_tmp_variable(dtype='int32')
+    dtype = matched_indices.dtype
+    updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='mine_hard_examples',
+        inputs={
+            'ClsLoss': conf_loss,
+            'LocLoss': None,
+            'MatchIndices': matched_indices,
+            'MatchDist': matched_dist,
+        },
+        outputs={
+            'NegIndices': neg_indices,
+            'UpdatedMatchIndices': updated_matched_indices
+        },
+        attrs={
+            'neg_pos_ratio': neg_pos_ratio,
+            'neg_dist_threshold': neg_overlap,  # overlap bound, not the ratio
+            'mining_type': mining_type,
+            'sample_size': sample_size,
+        })
+
+    # 4. Assign classification and regression targets
+    # 4.1. Encoded bbox according to the prior boxes.
+    encoded_bbox = box_coder(
+        prior_box=prior_box,
+        prior_box_var=prior_box_var,
+        target_box=gt_box,
+        code_type='encode_center_size')
+    # 4.2. Assign regression targets
+    target_bbox, target_loc_weight = target_assign(
+        encoded_bbox, updated_matched_indices, mismatch_value=background_label)
+    # 4.3. Assign classification targets
+    target_label, target_conf_weight = target_assign(
+        gt_label,
+        updated_matched_indices,
+        negative_indices=neg_indices,
+        mismatch_value=background_label)
+
+    # 5. Compute loss.
+    # 5.1 Compute confidence loss.
+    target_label = __reshape_to_2d(target_label)
+    target_label = tensor.cast(x=target_label, dtype='int64')
+    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
+    target_conf_weight = __reshape_to_2d(target_conf_weight)
+    conf_loss = conf_loss * target_conf_weight
+
+    # 5.2 Compute regression loss.
+    location = __reshape_to_2d(location)
+    target_bbox = __reshape_to_2d(target_bbox)
+
+    loc_loss = nn.smooth_l1(location, target_bbox)
+    target_loc_weight = __reshape_to_2d(target_loc_weight)
+    loc_loss = loc_loss * target_loc_weight
+
+    # 5.3 Compute overall weighted loss.
+    loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
+    return loss
diff --git a/python/paddle/v2/fluid/tests/test_detection.py b/python/paddle/v2/fluid/tests/test_detection.py
index fecc2a622..b731fc9b0 100644
--- a/python/paddle/v2/fluid/tests/test_detection.py
+++ b/python/paddle/v2/fluid/tests/test_detection.py
@@ -13,16 +13,12 @@
 # limitations under the License.
 
 from __future__ import print_function
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
-import paddle.v2.fluid.layers.detection as detection
 from paddle.v2.fluid.framework import Program, program_guard
 import unittest
-import numpy as np
 
 
-class TestBook(unittest.TestCase):
+class TestDetection(unittest.TestCase):
     def test_detection_output(self):
         program = Program()
         with program_guard(program):
@@ -49,6 +45,66 @@ class TestBook(unittest.TestCase):
             out = layers.detection_output(
                 scores=scores, loc=loc, prior_box=pb, prior_box_var=pbv)
             self.assertIsNotNone(out)
+            self.assertEqual(out.shape[-1], 6)
+        print(str(program))
+
+    def test_detection_api(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[4], dtype='float32')
+            y = layers.data(name='y', shape=[4], dtype='float32')
+            z = layers.data(name='z', shape=[4], dtype='float32', lod_level=1)
+            iou = layers.iou_similarity(x=x, y=y)
+            bcoder = layers.box_coder(
+                prior_box=x,
+                prior_box_var=y,
+                target_box=z,
+                code_type='encode_center_size')
+            self.assertIsNotNone(iou)
+            self.assertIsNotNone(bcoder)
+            # Feed the IoU matrix to bipartite_match; check both outputs.
+            matched_indices, matched_dist = layers.bipartite_match(iou)
+            self.assertIsNotNone(matched_indices)
+            self.assertIsNotNone(matched_dist)
+            # target_assign on an int32 input declared with shape [1, 1].
+            gt = layers.data(
+                name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                gt, matched_indices, mismatch_value=0)
+            self.assertIsNotNone(trg)
+            self.assertIsNotNone(trg_weight)
+            # target_assign on a float32 input declared with shape [10, 4].
+            gt2 = layers.data(
+                name='gt2', shape=[10, 4], dtype='float32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                gt2, matched_indices, mismatch_value=0)
+            self.assertIsNotNone(trg)
+            self.assertIsNotNone(trg_weight)
+        # Dump the constructed program so the generated ops can be inspected.
+        print(str(program))
+
+    def test_ssd_loss(self):
+        program = Program()
+        with program_guard(program):
+            pb = layers.data(
+                name='prior_box',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            pbv = layers.data(
+                name='prior_box_var',
+                shape=[10, 4],
+                append_batch_size=False,
+                dtype='float32')
+            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
+            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
+            gt_box = layers.data(
+                name='gt_box', shape=[4], lod_level=1, dtype='float32')
+            gt_label = layers.data(
+                name='gt_label', shape=[1], lod_level=1, dtype='int32')
+            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+            self.assertIsNotNone(loss)  # the layer must build and return a var
+            self.assertEqual(loss.shape[-1], 1)  # last dim of the loss is 1
         print(str(program))
 
 
@@ -62,40 +118,39 @@ class TestPriorBox(unittest.TestCase):
         assert box.shape[1] == 4
 
     def prior_box_output(self, data_shape):
-        images = fluid.layers.data(
-            name='pixel', shape=data_shape, dtype='float32')
-        conv1 = fluid.layers.conv2d(
+        images = layers.data(name='pixel', shape=data_shape, dtype='float32')
+        conv1 = layers.conv2d(
             input=images,
             num_filters=3,
             filter_size=3,
             stride=2,
             use_cudnn=False)
-        conv2 = fluid.layers.conv2d(
+        conv2 = layers.conv2d(
             input=conv1,
             num_filters=3,
             filter_size=3,
             stride=2,
             use_cudnn=False)
-        conv3 = fluid.layers.conv2d(
+        conv3 = layers.conv2d(
             input=conv2,
             num_filters=3,
             filter_size=3,
             stride=2,
             use_cudnn=False)
-        conv4 = fluid.layers.conv2d(
+        conv4 = layers.conv2d(
             input=conv3,
             num_filters=3,
             filter_size=3,
             stride=2,
             use_cudnn=False)
-        conv5 = fluid.layers.conv2d(
+        conv5 = layers.conv2d(
             input=conv4,
             num_filters=3,
             filter_size=3,
             stride=2,
             use_cudnn=False)
 
-        box, var = detection.prior_box(
+        box, var = layers.prior_box(
             inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
             image=images,
             min_ratio=20,
-- 
GitLab